[clang] 23f0269 - [X86] Add AVX-VNNI-INT8 instructions.
Freddy Ye via cfe-commits
cfe-commits at lists.llvm.org
Thu Oct 27 19:40:38 PDT 2022
Author: Freddy Ye
Date: 2022-10-28T10:39:54+08:00
New Revision: 23f02693ec58efef6951fcaa689d26b440a25968
URL: https://github.com/llvm/llvm-project/commit/23f02693ec58efef6951fcaa689d26b440a25968
DIFF: https://github.com/llvm/llvm-project/commit/23f02693ec58efef6951fcaa689d26b440a25968.diff
LOG: [X86] Add AVX-VNNI-INT8 instructions.
For more details about these instructions, please refer to the latest ISE document: https://www.intel.com/content/www/us/en/develop/download/intel-architecture-instruction-set-extensions-programming-reference.html
Reviewed By: pengfei, skan
Differential Revision: https://reviews.llvm.org/D135938
Added:
clang/lib/Headers/avxvnniint8intrin.h
clang/test/CodeGen/avxvnniint8-builtins.c
llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll
llvm/test/MC/Disassembler/X86/avx-vnni-int8-32.txt
llvm/test/MC/Disassembler/X86/avx-vnni-int8-64.txt
llvm/test/MC/X86/avx_vnni_int8-32-att.s
llvm/test/MC/X86/avx_vnni_int8-32-intel.s
llvm/test/MC/X86/avx_vnni_int8-64-att.s
llvm/test/MC/X86/avx_vnni_int8-64-intel.s
Modified:
clang/docs/ReleaseNotes.rst
clang/include/clang/Basic/BuiltinsX86.def
clang/include/clang/Driver/Options.td
clang/lib/Basic/Targets/X86.cpp
clang/lib/Basic/Targets/X86.h
clang/lib/Headers/CMakeLists.txt
clang/lib/Headers/cpuid.h
clang/lib/Headers/immintrin.h
clang/test/CodeGen/attr-target-x86.c
clang/test/Driver/x86-target-features.c
clang/test/Preprocessor/x86_target_features.c
llvm/docs/ReleaseNotes.rst
llvm/include/llvm/IR/IntrinsicsX86.td
llvm/include/llvm/Support/X86TargetParser.def
llvm/lib/Support/Host.cpp
llvm/lib/Support/X86TargetParser.cpp
llvm/lib/Target/X86/X86.td
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86ISelLowering.h
llvm/lib/Target/X86/X86InstrFoldTables.cpp
llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
llvm/lib/Target/X86/X86InstrInfo.cpp
llvm/lib/Target/X86/X86InstrInfo.td
llvm/lib/Target/X86/X86InstrSSE.td
llvm/lib/Target/X86/X86IntrinsicsInfo.h
Removed:
################################################################################
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index d40475cd92b84..dd859c9269cfd 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -648,6 +648,10 @@ X86 Support in Clang
- Support ISA of ``AVX-IFMA``.
* Support intrinsic of ``_mm(256)_madd52hi_avx_epu64``.
* Support intrinsic of ``_mm(256)_madd52lo_avx_epu64``.
+- Support ISA of ``AVX-VNNI-INT8``.
+ * Support intrinsic of ``_mm(256)_dpbssd(s)_epi32``.
+ * Support intrinsic of ``_mm(256)_dpbsud(s)_epi32``.
+ * Support intrinsic of ``_mm(256)_dpbuud(s)_epi32``.
WebAssembly Support in Clang
----------------------------
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index d7ed93885020f..3fc77c95b7664 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -906,6 +906,7 @@ TARGET_BUILTIN(__builtin_ia32_alignq256, "V4OiV4OiV4OiIi", "ncV:256:", "avx512vl
TARGET_BUILTIN(__builtin_ia32_extractf64x4_mask, "V4dV8dIiV4dUc", "ncV:512:", "avx512f")
TARGET_BUILTIN(__builtin_ia32_extractf32x4_mask, "V4fV16fIiV4fUc", "ncV:512:", "avx512f")
+// AVX-VNNI and AVX512-VNNI
TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
@@ -919,6 +920,20 @@ TARGET_BUILTIN(__builtin_ia32_vpdpwssds128, "V4iV4iV4iV4i", "ncV:128:", "avx512v
TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
+// AVX-VNNI-INT8
+TARGET_BUILTIN(__builtin_ia32_vpdpbssd128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssd256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+
TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2dvC*V2OiUcIi", "nV:128:", "avx512vl")
TARGET_BUILTIN(__builtin_ia32_gather3div2di, "V2OiV2OivC*V2OiUcIi", "nV:128:", "avx512vl")
TARGET_BUILTIN(__builtin_ia32_gather3div4df, "V4dV4dvC*V4OiUcIi", "nV:256:", "avx512vl")
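A minimal sketch of what the new builtin declarations mean in practice
(illustrative, not part of the patch): the "V4iV4iV4iV4i" prototype string
says each 128-bit builtin returns a vector of four ints and takes three such
vectors, gated on the "avxvnniint8" target feature. The typedef below is the
usual Clang vector-extension spelling and the function name is made up.

/* Build with: clang -mavxvnniint8 -c dot.c */
typedef int __v4si __attribute__((__vector_size__(16)));

__v4si dpbssd_direct(__v4si acc, __v4si a, __v4si b) {
  /* Normally reached via _mm_dpbssd_epi32; calling the builtin
     directly here just to make the prototype string concrete. */
  return __builtin_ia32_vpdpbssd128(acc, a, b);
}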
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 0e80afa73b2a4..7c5974813ffa4 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4590,6 +4590,8 @@ def mavx512vp2intersect : Flag<["-"], "mavx512vp2intersect">, Group<m_x86_Featur
def mno_avx512vp2intersect : Flag<["-"], "mno-avx512vp2intersect">, Group<m_x86_Features_Group>;
def mavxifma : Flag<["-"], "mavxifma">, Group<m_x86_Features_Group>;
def mno_avxifma : Flag<["-"], "mno-avxifma">, Group<m_x86_Features_Group>;
+def mavxvnniint8 : Flag<["-"], "mavxvnniint8">, Group<m_x86_Features_Group>;
+def mno_avxvnniint8 : Flag<["-"], "mno-avxvnniint8">, Group<m_x86_Features_Group>;
def mavxvnni : Flag<["-"], "mavxvnni">, Group<m_x86_Features_Group>;
def mno_avxvnni : Flag<["-"], "mno-avxvnni">, Group<m_x86_Features_Group>;
def madx : Flag<["-"], "madx">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 78c032e2d7d4b..442503a13025c 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -342,6 +342,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAVXIFMA = true;
} else if (Feature == "+avxvnni") {
HasAVXVNNI = true;
+ } else if (Feature == "+avxvnniint8") {
+ HasAVXVNNIINT8 = true;
} else if (Feature == "+serialize") {
HasSERIALIZE = true;
} else if (Feature == "+tsxldtrk") {
@@ -796,6 +798,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AVXIFMA__");
if (HasAVXVNNI)
Builder.defineMacro("__AVXVNNI__");
+ if (HasAVXVNNIINT8)
+ Builder.defineMacro("__AVXVNNIINT8__");
if (HasSERIALIZE)
Builder.defineMacro("__SERIALIZE__");
if (HasTSXLDTRK)
@@ -920,6 +924,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("avx512vp2intersect", true)
.Case("avxifma", true)
.Case("avxvnni", true)
+ .Case("avxvnniint8", true)
.Case("bmi", true)
.Case("bmi2", true)
.Case("cldemote", true)
@@ -1019,6 +1024,8 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("avx512vp2intersect", HasAVX512VP2INTERSECT)
.Case("avxifma", HasAVXIFMA)
.Case("avxvnni", HasAVXVNNI)
+ .Case("avxvnni", HasAVXVNNI)
+ .Case("avxvnniint8", HasAVXVNNIINT8)
.Case("bmi", HasBMI)
.Case("bmi2", HasBMI2)
.Case("cldemote", HasCLDEMOTE)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index 7b67a4060ec3b..825087838941f 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -141,6 +141,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasAMXFP16 = false;
bool HasCMPCCXADD = false;
bool HasRAOINT = false;
+ bool HasAVXVNNIINT8 = false;
bool HasKL = false; // For key locker
bool HasWIDEKL = false; // For wide key locker
bool HasHRESET = false;
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 5a7f81b4ed07d..fdf3024045779 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -144,6 +144,7 @@ set(x86_files
avx512vpopcntdqvlintrin.h
avxifmaintrin.h
avxintrin.h
+ avxvnniint8intrin.h
avxvnniintrin.h
bmi2intrin.h
bmiintrin.h
diff --git a/clang/lib/Headers/avxvnniint8intrin.h b/clang/lib/Headers/avxvnniint8intrin.h
new file mode 100644
index 0000000000000..b0b6cb853f713
--- /dev/null
+++ b/clang/lib/Headers/avxvnniint8intrin.h
@@ -0,0 +1,471 @@
+/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXVNNIINT8INTRIN_H
+#define __AVXVNNIINT8INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS256 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
+ __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
+ __min_vector_width__(128)))
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSSD instruction.
+///
+/// \param __W
+/// A 128-bit vector of [4 x int].
+/// \param __A
+/// A 128-bit vector of [16 x char].
+/// \param __B
+/// A 128-bit vector of [16 x char].
+/// \returns
+/// A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
+/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
+/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
+/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
+/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
+ __m128i __A,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
+ (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSSD instruction.
+///
+/// \param __W
+/// A 256-bit vector of [8 x int].
+/// \param __A
+/// A 256-bit vector of [32 x char].
+/// \param __B
+/// A 256-bit vector of [32 x char].
+/// \returns
+/// A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
+/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
+/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
+/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
+/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
+ (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W with signed saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
+///
+/// \param __W
+/// A 128-bit vector of [4 x int].
+/// \param __A
+/// A 128-bit vector of [16 x char].
+/// \param __B
+/// A 128-bit vector of [16 x char].
+/// \returns
+/// A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
+/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
+/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
+/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
+/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
+ __m128i __A,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
+ (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W with signed saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
+///
+/// \param __W
+/// A 256-bit vector of [8 x int].
+/// \param __A
+/// A 256-bit vector of [32 x char].
+/// \param __B
+/// A 256-bit vector of [32 x char].
+/// \returns
+/// A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
+/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
+/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
+/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
+/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
+ (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSUD instruction.
+///
+/// \param __W
+/// A 128-bit vector of [4 x int].
+/// \param __A
+/// A 128-bit vector of [16 x char].
+/// \param __B
+/// A 128-bit vector of [16 x unsigned char].
+/// \returns
+/// A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
+/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
+/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
+/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
+/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
+ __m128i __A,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
+ (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSUD instruction.
+///
+/// \param __W
+/// A 256-bit vector of [8 x int].
+/// \param __A
+/// A 256-bit vector of [32 x char].
+/// \param __B
+/// A 256-bit vector of [32 x unsigned char].
+/// \returns
+/// A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
+/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
+/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
+/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
+/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
+ (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W with signed saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
+///
+/// \param __W
+/// A 128-bit vector of [4 x int].
+/// \param __A
+/// A 128-bit vector of [16 x char].
+/// \param __B
+/// A 128-bit vector of [16 x unsigned char].
+/// \returns
+/// A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
+/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
+/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
+/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
+/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
+ __m128i __A,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
+ (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W with signed saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
+///
+/// \param __W
+/// A 256-bit vector of [8 x int].
+/// \param __A
+/// A 256-bit vector of [32 x char].
+/// \param __B
+/// A 256-bit vector of [32 x unsigned char].
+/// \returns
+/// A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
+/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
+/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
+/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
+/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
+ (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// unsigned 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBUUD instruction.
+///
+/// \param __W
+/// A 128-bit vector of [4 x int].
+/// \param __A
+/// A 128-bit vector of [16 x unsigned char].
+/// \param __B
+/// A 128-bit vector of [16 x unsigned char].
+/// \returns
+/// A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
+/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
+/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
+/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
+/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
+ __m128i __A,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
+ (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// unsigned 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBUUD instruction.
+///
+/// \param __W
+/// A 256-bit vector of [8 x int].
+/// \param __A
+/// A 256-bit vector of [32 x unsigned char].
+/// \param __B
+/// A 256-bit vector of [32 x unsigned char].
+/// \returns
+/// A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
+/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
+/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
+/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
+/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
+ (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// unsigned 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W with unsigned saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
+///
+/// \param __W
+/// A 128-bit vector of [4 x int].
+/// \param __A
+/// A 128-bit vector of [16 x unsigned char].
+/// \param __B
+/// A 128-bit vector of [16 x unsigned char].
+/// \returns
+/// A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
+/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
+/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
+/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
+/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
+ __m128i __A,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
+ (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// unsigned 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W with unsigned saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
+///
+/// \param __W
+/// A 256-bit vector of [8 x int].
+/// \param __A
+/// A 256-bit vector of [32 x unsigned char].
+/// \param __B
+/// A 256-bit vector of [32 x unsigned char].
+/// \returns
+/// A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
+/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
+/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
+/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
+/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
+ (__v8si)__B);
+}
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif // __AVXVNNIINT8INTRIN_H
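To make the \code{.operation} pseudocode above concrete, here is a usage
sketch (illustrative, not part of the patch; both function names are made
up): a scalar reference for a single dword lane alongside the intrinsic that
computes eight lanes per call. Build with -mavxvnniint8.

#include <immintrin.h>

/* Scalar reference for one 32-bit lane of the signed-signed,
   non-saturating form: four signed-byte products are widened and
   summed into the accumulator. */
static int dpbssd_lane(int w, const signed char a[4], const signed char b[4]) {
  for (int k = 0; k < 4; ++k)
    w += (int)a[k] * (int)b[k];
  return w;
}

/* Vector form: each call folds 32 signed bytes from a and b into the
   8 dword accumulators of acc. */
__m256i dot_step(__m256i acc, __m256i a, __m256i b) {
  return _mm256_dpbssd_epi32(acc, a, b);
}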
diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h
index 2e674ef6d96c3..f5884c23eedcf 100644
--- a/clang/lib/Headers/cpuid.h
+++ b/clang/lib/Headers/cpuid.h
@@ -209,6 +209,7 @@
#define bit_AVXIFMA 0x00800000
/* Features in %edx for leaf 7 sub-leaf 1 */
+#define bit_AVXVNNIINT8 0x00000010
#define bit_PREFETCHI 0x00004000
/* Features in %eax for leaf 13 sub-leaf 1 */
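A short runtime-detection sketch using this header (illustrative, not part
of the patch; has_avxvnniint8 is a made-up name, while __get_cpuid_count is
the helper cpuid.h already provides). A production check would also verify
OS support for YMM state via XGETBV before executing the instructions.

#include <cpuid.h>

static int has_avxvnniint8(void) {
  unsigned int eax, ebx, ecx, edx;
  /* bit_AVXVNNIINT8 is reported in %edx of leaf 7, sub-leaf 1. */
  if (!__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx))
    return 0;
  return (edx & bit_AVXVNNIINT8) != 0;
}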
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 00ee91b364aeb..1204dc700c63a 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -259,6 +259,11 @@
#include <gfniintrin.h>
#endif
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVXVNNIINT8__)
+#include <avxvnniint8intrin.h>
+#endif
+
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__RDPID__)
/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c
index 604e3152debad..9a8a6643a6c60 100644
--- a/clang/test/CodeGen/attr-target-x86.c
+++ b/clang/test/CodeGen/attr-target-x86.c
@@ -54,9 +54,9 @@ void __attribute__((target("arch=x86-64-v4"))) x86_64_v4(void) {}
// CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87" "tune-cpu"="i686"
// CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
// CHECK-NOT: tune-cpu
-// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
+// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxvnni,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
// CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+crc32,+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686"
-// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxvnni,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
+// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxvnni,-avxvnniint8,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
// CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes"
// CHECK-NOT: tune-cpu
// CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-3dnow,-3dnowa,-mmx"
diff --git a/clang/test/CodeGen/avxvnniint8-builtins.c b/clang/test/CodeGen/avxvnniint8-builtins.c
new file mode 100644
index 0000000000000..cbdf443888a15
--- /dev/null
+++ b/clang/test/CodeGen/avxvnniint8-builtins.c
@@ -0,0 +1,76 @@
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64- -target-feature +avxvnniint8 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=i386- -target-feature +avxvnniint8 -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+#include <immintrin.h>
+
+// CHECK-LABEL: @test_mm_dpbssd_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbssd.128
+__m128i test_mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B) {
+ return _mm_dpbssd_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm_dpbssds_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbssds.128
+__m128i test_mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B) {
+ return _mm_dpbssds_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm_dpbsud_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbsud.128
+__m128i test_mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B) {
+ return _mm_dpbsud_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm_dpbsuds_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128
+__m128i test_mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B) {
+ return _mm_dpbsuds_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm_dpbuud_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbuud.128
+__m128i test_mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B) {
+ return _mm_dpbuud_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm_dpbuuds_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128
+__m128i test_mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B) {
+ return _mm_dpbuuds_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm256_dpbssd_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbssd.256
+__m256i test_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return _mm256_dpbssd_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm256_dpbssds_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbssds.256
+__m256i test_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return _mm256_dpbssds_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm256_dpbsud_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbsud.256
+__m256i test_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return _mm256_dpbsud_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm256_dpbsuds_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256
+__m256i test_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return _mm256_dpbsuds_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm256_dpbuud_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbuud.256
+__m256i test_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return _mm256_dpbuud_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm256_dpbuuds_epi32(
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256
+__m256i test_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return _mm256_dpbuuds_epi32(__W, __A, __B);
+}
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index 0954ad135b328..2c69d7903b012 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -332,6 +332,11 @@
// AVXIFMA: "-target-feature" "+avxifma"
// NO-AVXIFMA: "-target-feature" "-avxifma"
+// RUN: %clang --target=i386 -mavxvnniint8 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX-VNNIINT8 %s
+// RUN: %clang --target=i386 -mno-avxvnniint8 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AVX-VNNIINT8 %s
+// AVX-VNNIINT8: "-target-feature" "+avxvnniint8"
+// NO-AVX-VNNIINT8: "-target-feature" "-avxvnniint8"
+
// RUN: %clang --target=i386 -march=i386 -mcrc32 %s -### 2>&1 | FileCheck -check-prefix=CRC32 %s
// RUN: %clang --target=i386 -march=i386 -mno-crc32 %s -### 2>&1 | FileCheck -check-prefix=NO-CRC32 %s
// CRC32: "-target-feature" "+crc32"
diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
index 2db998f4ee822..46e76a3517afd 100644
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -620,6 +620,20 @@
// NO-RAOINT-NOT: #define __RAOINT__ 1
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnniint8 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNIINT8 %s
+
+// AVXVNNIINT8: #define __AVX2__ 1
+// AVXVNNIINT8: #define __AVXVNNIINT8__ 1
+
+// RUN: %clang -target i386-unknown-unknown -march=atom -mno-avxvnniint8 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOAVXVNNIINT8 %s
+
+// NOAVXVNNIINT8-NOT: #define __AVXVNNIINT8__ 1
+
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnniint8 -mno-avx2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNIINT8NOAVX2 %s
+
+// AVXVNNIINT8NOAVX2-NOT: #define __AVX2__ 1
+// AVXVNNIINT8NOAVX2-NOT: #define __AVXVNNIINT8__ 1
+
// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mcrc32 -x c -E -dM -o - %s | FileCheck -check-prefix=CRC32 %s
// CRC32: #define __CRC32__ 1
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index d892d92297e4b..556f7fc7f5449 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -141,6 +141,7 @@ Changes to the X86 Backend
* Add support for the ``WRMSRNS`` instruction.
* Support ISA of ``AMX-FP16`` which contains ``tdpfp16ps`` instruction.
* Support ISA of ``CMPCCXADD``.
+* Support ISA of ``AVX-VNNI-INT8``.
Changes to the OCaml bindings
-----------------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index d45be12cda877..bc0e4106a410b 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -1929,6 +1929,66 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
ClangBuiltin<"__builtin_ia32_vpdpwssds512">,
Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
llvm_v16i32_ty], [IntrNoMem]>;
+ def int_x86_avx2_vpdpbssd_128
+ : ClangBuiltin<"__builtin_ia32_vpdpbssd128">,
+ Intrinsic<[llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpbssd_256
+ : ClangBuiltin<"__builtin_ia32_vpdpbssd256">,
+ Intrinsic<[llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpbssds_128
+ : ClangBuiltin<"__builtin_ia32_vpdpbssds128">,
+ Intrinsic<[llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpbssds_256
+ : ClangBuiltin<"__builtin_ia32_vpdpbssds256">,
+ Intrinsic<[llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpbsud_128
+ : ClangBuiltin<"__builtin_ia32_vpdpbsud128">,
+ Intrinsic<[llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpbsud_256
+ : ClangBuiltin<"__builtin_ia32_vpdpbsud256">,
+ Intrinsic<[llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpbsuds_128
+ : ClangBuiltin<"__builtin_ia32_vpdpbsuds128">,
+ Intrinsic<[llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpbsuds_256
+ : ClangBuiltin<"__builtin_ia32_vpdpbsuds256">,
+ Intrinsic<[llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpbuud_128
+ : ClangBuiltin<"__builtin_ia32_vpdpbuud128">,
+ Intrinsic<[llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpbuud_256
+ : ClangBuiltin<"__builtin_ia32_vpdpbuud256">,
+ Intrinsic<[llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpbuuds_128
+ : ClangBuiltin<"__builtin_ia32_vpdpbuuds128">,
+ Intrinsic<[llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_vpdpbuuds_256
+ : ClangBuiltin<"__builtin_ia32_vpdpbuuds256">,
+ Intrinsic<[llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [IntrNoMem]>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/Support/X86TargetParser.def b/llvm/include/llvm/Support/X86TargetParser.def
index e8e50acf1771b..2c656e19d0d19 100644
--- a/llvm/include/llvm/Support/X86TargetParser.def
+++ b/llvm/include/llvm/Support/X86TargetParser.def
@@ -207,6 +207,7 @@ X86_FEATURE (AMX_FP16, "amx-fp16")
X86_FEATURE (CMPCCXADD, "cmpccxadd")
X86_FEATURE (AVXVNNI, "avxvnni")
X86_FEATURE (AVXIFMA, "avxifma")
+X86_FEATURE (AVXVNNIINT8, "avxvnniint8")
// These features aren't really CPU features, but the frontend can set them.
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk")
X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches")
diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp
index 96663c8d40e43..d1f01fce62a15 100644
--- a/llvm/lib/Support/Host.cpp
+++ b/llvm/lib/Support/Host.cpp
@@ -1812,6 +1812,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["cmpccxadd"] = HasLeaf7Subleaf1 && ((EAX >> 7) & 1);
Features["hreset"] = HasLeaf7Subleaf1 && ((EAX >> 22) & 1);
Features["avxifma"] = HasLeaf7Subleaf1 && ((EAX >> 23) & 1) && HasAVXSave;
+ Features["avxvnniint8"] = HasLeaf7Subleaf1 && ((EDX >> 4) & 1) && HasAVXSave;
Features["prefetchi"] = HasLeaf7Subleaf1 && ((EDX >> 14) & 1);
bool HasLeafD = MaxLevel >= 0xd &&
diff --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp
index 2b0a3bb881201..e6074b1e904ec 100644
--- a/llvm/lib/Support/X86TargetParser.cpp
+++ b/llvm/lib/Support/X86TargetParser.cpp
@@ -582,6 +582,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_FP16 = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesHRESET = {};
+constexpr FeatureBitset ImpliedFeaturesAVXVNNIINT8 = FeatureAVX2;
constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {};
constexpr FeatureBitset ImpliedFeaturesCMPCCXADD = {};
constexpr FeatureBitset ImpliedFeaturesRAOINT = {};
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 0f0cf69ed012e..a860352acad62 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -187,6 +187,10 @@ def FeatureVP2INTERSECT : SubtargetFeature<"avx512vp2intersect",
def FeatureFP16 : SubtargetFeature<"avx512fp16", "HasFP16", "true",
"Support 16-bit floating point",
[FeatureBWI, FeatureVLX, FeatureDQI]>;
+def FeatureAVXVNNIINT8 : SubtargetFeature<"avxvnniint8",
+ "HasAVXVNNIINT8", "true",
+ "Enable AVX-VNNI-INT8",
+ [FeatureAVX2]>;
def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
"Enable packed carry-less multiplication instructions",
[FeatureSSE2]>;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 181171272cce3..9fd07a7301bb3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34221,6 +34221,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(ENQCMD)
NODE_NAME_CASE(ENQCMDS)
NODE_NAME_CASE(VP2INTERSECT)
+ NODE_NAME_CASE(VPDPBSUD)
+ NODE_NAME_CASE(VPDPBSUDS)
+ NODE_NAME_CASE(VPDPBUUD)
+ NODE_NAME_CASE(VPDPBUUDS)
+ NODE_NAME_CASE(VPDPBSSD)
+ NODE_NAME_CASE(VPDPBSSDS)
NODE_NAME_CASE(AESENC128KL)
NODE_NAME_CASE(AESDEC128KL)
NODE_NAME_CASE(AESENC256KL)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 50d9bbf613ff5..ba5f31e03777a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -584,6 +584,13 @@ namespace llvm {
VFCMULCSH,
VFCMULCSH_RND,
+ VPDPBSUD,
+ VPDPBSUDS,
+ VPDPBUUD,
+ VPDPBUUDS,
+ VPDPBSSD,
+ VPDPBSSDS,
+
// Compress and expand.
COMPRESS,
EXPAND,
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index 9d58889814037..f4e24329ff014 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -3995,6 +3995,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmk, 0 },
{ X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0 },
{ X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0 },
+ { X86::VPDPBSSDSYrr, X86::VPDPBSSDSYrm, 0 },
+ { X86::VPDPBSSDSrr, X86::VPDPBSSDSrm, 0 },
+ { X86::VPDPBSSDYrr, X86::VPDPBSSDYrm, 0 },
+ { X86::VPDPBSSDrr, X86::VPDPBSSDrm, 0 },
+ { X86::VPDPBSUDSYrr, X86::VPDPBSUDSYrm, 0 },
+ { X86::VPDPBSUDSrr, X86::VPDPBSUDSrm, 0 },
+ { X86::VPDPBSUDYrr, X86::VPDPBSUDYrm, 0 },
+ { X86::VPDPBSUDrr, X86::VPDPBSUDrm, 0 },
{ X86::VPDPBUSDSYrr, X86::VPDPBUSDSYrm, 0 },
{ X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128m, 0 },
{ X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256m, 0 },
@@ -4005,6 +4013,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VPDPBUSDZ256r, X86::VPDPBUSDZ256m, 0 },
{ X86::VPDPBUSDZr, X86::VPDPBUSDZm, 0 },
{ X86::VPDPBUSDrr, X86::VPDPBUSDrm, 0 },
+ { X86::VPDPBUUDSYrr, X86::VPDPBUUDSYrm, 0 },
+ { X86::VPDPBUUDSrr, X86::VPDPBUUDSrm, 0 },
+ { X86::VPDPBUUDYrr, X86::VPDPBUUDYrm, 0 },
+ { X86::VPDPBUUDrr, X86::VPDPBUUDrm, 0 },
{ X86::VPDPWSSDSYrr, X86::VPDPWSSDSYrm, 0 },
{ X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128m, 0 },
{ X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256m, 0 },
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 65c1adc38d7dc..774f7e92ebb31 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -813,6 +813,13 @@ def SDTX86MaskedStore: SDTypeProfile<0, 3, [ // masked store
SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>
]>;
+def X86vpdpbssd : SDNode<"X86ISD::VPDPBSSD", SDTVnni>;
+def X86vpdpbssds : SDNode<"X86ISD::VPDPBSSDS", SDTVnni>;
+def X86vpdpbsud : SDNode<"X86ISD::VPDPBSUD", SDTVnni>;
+def X86vpdpbsuds : SDNode<"X86ISD::VPDPBSUDS", SDTVnni>;
+def X86vpdpbuud : SDNode<"X86ISD::VPDPBUUD", SDTVnni>;
+def X86vpdpbuuds : SDNode<"X86ISD::VPDPBUUDS", SDTVnni>;
+
//===----------------------------------------------------------------------===//
// SSE pattern fragments
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 4f17635b42df1..4668a35c76cb3 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -2555,6 +2555,14 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
case X86::VPDPWSSDrr:
case X86::VPDPWSSDSYrr:
case X86::VPDPWSSDSrr:
+ case X86::VPDPBSSDSrr:
+ case X86::VPDPBSSDSYrr:
+ case X86::VPDPBSSDrr:
+ case X86::VPDPBSSDYrr:
+ case X86::VPDPBUUDSrr:
+ case X86::VPDPBUUDSYrr:
+ case X86::VPDPBUUDrr:
+ case X86::VPDPBUUDYrr:
case X86::VPDPWSSDZ128r:
case X86::VPDPWSSDZ128rk:
case X86::VPDPWSSDZ128rkz:
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index 8e038f4c00a30..9ce2afec869b4 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -921,6 +921,7 @@ def HasVNNI : Predicate<"Subtarget->hasVNNI()">;
def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">;
def HasBF16 : Predicate<"Subtarget->hasBF16()">;
def HasFP16 : Predicate<"Subtarget->hasFP16()">;
+def HasAVXVNNIINT8 : Predicate<"Subtarget->hasAVXVNNIINT8()">;
def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">;
def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index a34eeb60f7ed5..99d575b570bb1 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -8160,3 +8160,62 @@ multiclass avx_ifma_rm<bits<8> opc, string OpcodeStr, SDNode OpNode> {
defm VPMADD52HUQ : avx_ifma_rm<0xb5, "vpmadd52huq", x86vpmadd52h>, VEX_W, ExplicitVEXPrefix;
defm VPMADD52LUQ : avx_ifma_rm<0xb4, "vpmadd52luq", x86vpmadd52l>, VEX_W, ExplicitVEXPrefix;
+
+let Constraints = "$src1 = $dst" in
+multiclass avx_dotprod_rm<bits<8> Opc, string OpcodeStr, ValueType OpVT,
+ RegisterClass RC, PatFrag MemOpFrag,
+ X86MemOperand X86memop, SDNode OpNode,
+ X86FoldableSchedWrite Sched,
+ bit IsCommutable> {
+ let isCommutable = IsCommutable in
+ def rr : I<Opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
+ VEX_4V, Sched<[Sched]>;
+ def rm : I<Opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, X86memop:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2,
+ (MemOpFrag addr:$src3))))]>,
+ VEX_4V, Sched<[Sched.Folded, Sched.ReadAfterFold]>;
+}
+
+let Predicates = [HasAVXVNNIINT8] in {
+ defm VPDPBSSD : avx_dotprod_rm<0x50,"vpdpbssd", v4i32, VR128, loadv4i32,
+ i128mem, X86vpdpbssd, SchedWriteVecIMul.XMM,
+ 1>, T8XD;
+ defm VPDPBSSDY : avx_dotprod_rm<0x50,"vpdpbssd", v8i32, VR256, loadv8i32,
+ i256mem, X86vpdpbssd, SchedWriteVecIMul.YMM,
+ 1>, VEX_L, T8XD;
+ defm VPDPBUUD : avx_dotprod_rm<0x50,"vpdpbuud", v4i32, VR128, loadv4i32,
+ i128mem, X86vpdpbuud, SchedWriteVecIMul.XMM,
+ 1>, T8PS;
+ defm VPDPBUUDY : avx_dotprod_rm<0x50,"vpdpbuud", v8i32, VR256, loadv8i32,
+ i256mem, X86vpdpbuud, SchedWriteVecIMul.YMM,
+ 1>, VEX_L, T8PS;
+ defm VPDPBSSDS : avx_dotprod_rm<0x51,"vpdpbssds", v4i32, VR128, loadv4i32,
+ i128mem, X86vpdpbssds, SchedWriteVecIMul.XMM,
+ 1>, T8XD;
+ defm VPDPBSSDSY : avx_dotprod_rm<0x51,"vpdpbssds", v8i32, VR256, loadv8i32,
+ i256mem, X86vpdpbssds, SchedWriteVecIMul.YMM,
+ 1>, VEX_L, T8XD;
+ defm VPDPBUUDS : avx_dotprod_rm<0x51,"vpdpbuuds", v4i32, VR128, loadv4i32,
+ i128mem, X86vpdpbuuds, SchedWriteVecIMul.XMM,
+ 1>, T8PS;
+ defm VPDPBUUDSY : avx_dotprod_rm<0x51,"vpdpbuuds", v8i32, VR256, loadv8i32,
+ i256mem, X86vpdpbuuds, SchedWriteVecIMul.YMM,
+ 1>, VEX_L, T8PS;
+ defm VPDPBSUD : avx_dotprod_rm<0x50,"vpdpbsud", v4i32, VR128, loadv4i32,
+ i128mem, X86vpdpbsud, SchedWriteVecIMul.XMM,
+ 0>, T8XS;
+ defm VPDPBSUDY : avx_dotprod_rm<0x50,"vpdpbsud", v8i32, VR256, loadv8i32,
+ i256mem, X86vpdpbsud, SchedWriteVecIMul.YMM,
+ 0>, VEX_L, T8XS;
+ defm VPDPBSUDS : avx_dotprod_rm<0x51,"vpdpbsuds", v4i32, VR128, loadv4i32,
+ i128mem, X86vpdpbsuds, SchedWriteVecIMul.XMM,
+ 0>, T8XS;
+ defm VPDPBSUDSY : avx_dotprod_rm<0x51,"vpdpbsuds", v8i32, VR256, loadv8i32,
+ i256mem, X86vpdpbsuds, SchedWriteVecIMul.YMM,
+ 0>, VEX_L, T8XS;
+}
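Note how all six instructions share opcodes 0x50/0x51 and are distinguished
only by the mandatory SIMD prefix (T8XD = F2 for the ss forms, T8XS = F3 for
su, T8PS = none for uu), which matches the VEX.pp bits visible in the MC
encodings in the tests below. A tiny inline-asm sketch (illustrative, not
part of the patch; build with -mavxvnniint8) emitting the three flavors of
the same opcode byte:

void prefix_demo(void) {
  /* Same 0x50 opcode each time; only the mandatory prefix differs.
     Demonstration only: xmm0 is clobbered without telling the compiler. */
  __asm__ volatile("vpdpbssd %xmm2, %xmm1, %xmm0"); /* F2   (T8XD) */
  __asm__ volatile("vpdpbsud %xmm2, %xmm1, %xmm0"); /* F3   (T8XS) */
  __asm__ volatile("vpdpbuud %xmm2, %xmm1, %xmm0"); /* none (T8PS) */
}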
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 6112c0b7d6c3e..3bb2f07b5f1a1 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -415,6 +415,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpbssd_128, INTR_TYPE_3OP, X86ISD::VPDPBSSD, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpbssd_256, INTR_TYPE_3OP, X86ISD::VPDPBSSD, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpbssds_128, INTR_TYPE_3OP, X86ISD::VPDPBSSDS, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpbssds_256, INTR_TYPE_3OP, X86ISD::VPDPBSSDS, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpbsud_128, INTR_TYPE_3OP, X86ISD::VPDPBSUD, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpbsud_256, INTR_TYPE_3OP, X86ISD::VPDPBSUD, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpbsuds_128, INTR_TYPE_3OP, X86ISD::VPDPBSUDS, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpbsuds_256, INTR_TYPE_3OP, X86ISD::VPDPBSUDS, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpbuud_128, INTR_TYPE_3OP, X86ISD::VPDPBUUD, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpbuud_256, INTR_TYPE_3OP, X86ISD::VPDPBUUD, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpbuuds_128, INTR_TYPE_3OP, X86ISD::VPDPBUUDS, 0),
+ X86_INTRINSIC_DATA(avx2_vpdpbuuds_256, INTR_TYPE_3OP, X86ISD::VPDPBUUDS, 0),
X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512_conflict_d_128, INTR_TYPE_1OP, X86ISD::CONFLICT, 0),
diff --git a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
new file mode 100644
index 0000000000000..5c17079519116
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
@@ -0,0 +1,316 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxvnniint8 --show-mc-encoding | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnniint8 --show-mc-encoding | FileCheck %s --check-prefixes=X64
+
+
+declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbssd_128:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X86-NEXT: vpdpbssd (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x50,0x18]
+; X86-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0xc2]
+; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbssd_128:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X64-NEXT: vpdpbssd (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x50,0x1f]
+; X64-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0xc2]
+; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+ %x2 = load <4 x i32>, <4 x i32>* %x2p
+ %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %res = add <4 x i32> %1, %2
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbssds_128:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X86-NEXT: vpdpbssds (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x51,0x18]
+; X86-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x51,0xc2]
+; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbssds_128:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X64-NEXT: vpdpbssds (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x51,0x1f]
+; X64-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x51,0xc2]
+; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+ %x2 = load <4 x i32>, <4 x i32>* %x2p
+ %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %res = add <4 x i32> %1, %2
+ ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbssd_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X86-NEXT: vpdpbssd (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x50,0x18]
+; X86-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x50,0xc2]
+; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbssd_256:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X64-NEXT: vpdpbssd (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x50,0x1f]
+; X64-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x50,0xc2]
+; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+ %x2 = load <8 x i32>, <8 x i32>* %x2p
+ %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %res = add <8 x i32> %1, %2
+ ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbssds_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X86-NEXT: vpdpbssds (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x51,0x18]
+; X86-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0xc2]
+; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbssds_256:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X64-NEXT: vpdpbssds (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x51,0x1f]
+; X64-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0xc2]
+; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+ %x2 = load <8 x i32>, <8 x i32>* %x2p
+ %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %res = add <8 x i32> %1, %2
+ ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbsud_128:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X86-NEXT: vpdpbsud (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x50,0x18]
+; X86-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0xc2]
+; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbsud_128:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X64-NEXT: vpdpbsud (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x50,0x1f]
+; X64-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0xc2]
+; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+ %x2 = load <4 x i32>, <4 x i32>* %x2p
+ %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %res = add <4 x i32> %1, %2
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbsuds_128:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X86-NEXT: vpdpbsuds (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x51,0x18]
+; X86-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x51,0xc2]
+; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbsuds_128:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X64-NEXT: vpdpbsuds (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x51,0x1f]
+; X64-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x51,0xc2]
+; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+ %x2 = load <4 x i32>, <4 x i32>* %x2p
+ %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %res = add <4 x i32> %1, %2
+ ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32> @test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbsud_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X86-NEXT: vpdpbsud (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x50,0x18]
+; X86-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x50,0xc2]
+; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbsud_256:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X64-NEXT: vpdpbsud (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x50,0x1f]
+; X64-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x50,0xc2]
+; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+ %x2 = load <8 x i32>, <8 x i32>* %x2p
+ %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %res = add <8 x i32> %1, %2
+ ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32> @test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbsuds_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X86-NEXT: vpdpbsuds (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x51,0x18]
+; X86-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0xc2]
+; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbsuds_256:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X64-NEXT: vpdpbsuds (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x51,0x1f]
+; X64-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0xc2]
+; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+ %x2 = load <8 x i32>, <8 x i32>* %x2p
+ %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %res = add <8 x i32> %1, %2
+ ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbuud_128:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X86-NEXT: vpdpbuud (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x50,0x18]
+; X86-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0xc2]
+; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbuud_128:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X64-NEXT: vpdpbuud (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x50,0x1f]
+; X64-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0xc2]
+; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+ %x2 = load <4 x i32>, <4 x i32>* %x2p
+ %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %res = add <4 x i32> %1, %2
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbuuds_128:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X86-NEXT: vpdpbuuds (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x51,0x18]
+; X86-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x51,0xc2]
+; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbuuds_128:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X64-NEXT: vpdpbuuds (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x51,0x1f]
+; X64-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x51,0xc2]
+; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+ %x2 = load <4 x i32>, <4 x i32>* %x2p
+ %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %res = add <4 x i32> %1, %2
+ ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32> @test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbuud_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X86-NEXT: vpdpbuud (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x50,0x18]
+; X86-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x50,0xc2]
+; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbuud_256:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X64-NEXT: vpdpbuud (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x50,0x1f]
+; X64-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x50,0xc2]
+; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+ %x2 = load <8 x i32>, <8 x i32>* %x2p
+ %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %res = add <8 x i32> %1, %2
+ ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32> @test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbuuds_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X86-NEXT: vpdpbuuds (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x51,0x18]
+; X86-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0xc2]
+; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbuuds_256:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X64-NEXT: vpdpbuuds (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x51,0x1f]
+; X64-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0xc2]
+; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+ %x2 = load <8 x i32>, <8 x i32>* %x2p
+ %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %res = add <8 x i32> %1, %2
+ ret <8 x i32> %res
+}
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll
new file mode 100644
index 0000000000000..fd988f7d318fe
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll
@@ -0,0 +1,355 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avxvnniint8 < %s | FileCheck %s
+
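+; Each test spills the third source to the stack: the inline-asm "nop" clobbers
+; xmm3-xmm15, forcing %a2 to be reloaded, and the CHECK lines verify that llc
+; folds the reload straight into the VNNI instruction's memory operand.
+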
+declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <4 x i32> @stack_fold_vpdpbssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbssd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbssd_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbssd_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ ret <8 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbssd_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbssds:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbssds_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbssds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ ret <8 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbssds_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbsud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbsud:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbsud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ ret <4 x i32> %2
+}
+
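+; Unlike the commutable ss and uu forms, vpdpbsud multiplies signed bytes from
+; one source by unsigned bytes from the other, so its multiplicand operands
+; cannot be swapped to enable folding; llc instead reloads the spilled value
+; into a register, as the CHECK lines below show.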
+define <4 x i32> @stack_fold_vpdpbsud_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbsud_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; CHECK-NEXT: vpdpbsud %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbsud_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbsud_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbsud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ ret <8 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbsud_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbsud_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; CHECK-NEXT: vpdpbsud %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbsuds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbsuds:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbsuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbsuds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbsuds_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; CHECK-NEXT: vpdpbsuds %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbsuds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbsuds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbsuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ ret <8 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbsuds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbsuds_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; CHECK-NEXT: vpdpbsuds %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbuud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbuud:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbuud_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbuud_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbuud_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbuud_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ ret <8 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbuud_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbuud_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbuuds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbuuds:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbuuds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbuuds_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbuuds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbuuds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ ret <8 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbuuds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbuuds_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
diff --git a/llvm/test/MC/Disassembler/X86/avx-vnni-int8-32.txt b/llvm/test/MC/Disassembler/X86/avx-vnni-int8-32.txt
new file mode 100644
index 0000000000000..63218ef55e57f
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/avx-vnni-int8-32.txt
@@ -0,0 +1,243 @@
+# RUN: llvm-mc --disassemble %s -triple=i686 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=i686 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
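+# For reference, every form below uses the three-byte VEX prefix with the 0F38
+# opcode map: opcode 0x50 is the plain dword accumulate and 0x51 the saturating
+# form, while the VEX.pp field selects the signedness variant (F2 = ss,
+# F3 = su, no prefix = uu).
+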
+# ATT: vpdpbssd %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbssd ymm2, ymm3, ymm4
+0xc4,0xe2,0x67,0x50,0xd4
+
+# ATT: vpdpbssd %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbssd xmm2, xmm3, xmm4
+0xc4,0xe2,0x63,0x50,0xd4
+
+# ATT: vpdpbssd 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x67,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssd 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: vpdpbssd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x67,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssd (%eax), %ymm3, %ymm2
+# INTEL: vpdpbssd ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x67,0x50,0x10
+
+# ATT: vpdpbssd -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x67,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbssd 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x63,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssd 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: vpdpbssd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x63,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssd (%eax), %xmm3, %xmm2
+# INTEL: vpdpbssd xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x63,0x50,0x10
+
+# ATT: vpdpbssd -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x63,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbssds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbssds ymm2, ymm3, ymm4
+0xc4,0xe2,0x67,0x51,0xd4
+
+# ATT: vpdpbssds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbssds xmm2, xmm3, xmm4
+0xc4,0xe2,0x63,0x51,0xd4
+
+# ATT: vpdpbssds 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x67,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssds 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: vpdpbssds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x67,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssds (%eax), %ymm3, %ymm2
+# INTEL: vpdpbssds ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x67,0x51,0x10
+
+# ATT: vpdpbssds -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x67,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbssds 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x63,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssds 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: vpdpbssds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x63,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssds (%eax), %xmm3, %xmm2
+# INTEL: vpdpbssds xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x63,0x51,0x10
+
+# ATT: vpdpbssds -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x63,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbsud %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbsud ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0x50,0xd4
+
+# ATT: vpdpbsud %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbsud xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0x50,0xd4
+
+# ATT: vpdpbsud 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsud 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: vpdpbsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x66,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsud (%eax), %ymm3, %ymm2
+# INTEL: vpdpbsud ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x66,0x50,0x10
+
+# ATT: vpdpbsud -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbsud 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsud 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: vpdpbsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x62,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsud (%eax), %xmm3, %xmm2
+# INTEL: vpdpbsud xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x62,0x50,0x10
+
+# ATT: vpdpbsud -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbsuds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbsuds ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0x51,0xd4
+
+# ATT: vpdpbsuds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbsuds xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0x51,0xd4
+
+# ATT: vpdpbsuds 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsuds 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: vpdpbsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x66,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsuds (%eax), %ymm3, %ymm2
+# INTEL: vpdpbsuds ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x66,0x51,0x10
+
+# ATT: vpdpbsuds -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbsuds 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsuds 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: vpdpbsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x62,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsuds (%eax), %xmm3, %xmm2
+# INTEL: vpdpbsuds xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x62,0x51,0x10
+
+# ATT: vpdpbsuds -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbuud %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbuud ymm2, ymm3, ymm4
+0xc4,0xe2,0x64,0x50,0xd4
+
+# ATT: vpdpbuud %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbuud xmm2, xmm3, xmm4
+0xc4,0xe2,0x60,0x50,0xd4
+
+# ATT: vpdpbuud 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x64,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuud 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: vpdpbuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x64,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuud (%eax), %ymm3, %ymm2
+# INTEL: vpdpbuud ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x64,0x50,0x10
+
+# ATT: vpdpbuud -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x64,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbuud 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x60,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuud 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: vpdpbuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x60,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuud (%eax), %xmm3, %xmm2
+# INTEL: vpdpbuud xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x60,0x50,0x10
+
+# ATT: vpdpbuud -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x60,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbuuds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbuuds ymm2, ymm3, ymm4
+0xc4,0xe2,0x64,0x51,0xd4
+
+# ATT: vpdpbuuds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbuuds xmm2, xmm3, xmm4
+0xc4,0xe2,0x60,0x51,0xd4
+
+# ATT: vpdpbuuds 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x64,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuuds 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: vpdpbuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x64,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuuds (%eax), %ymm3, %ymm2
+# INTEL: vpdpbuuds ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x64,0x51,0x10
+
+# ATT: vpdpbuuds -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x64,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbuuds 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x60,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuuds 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: vpdpbuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x60,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuuds (%eax), %xmm3, %xmm2
+# INTEL: vpdpbuuds xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x60,0x51,0x10
+
+# ATT: vpdpbuuds -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x60,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff
+
diff --git a/llvm/test/MC/Disassembler/X86/avx-vnni-int8-64.txt b/llvm/test/MC/Disassembler/X86/avx-vnni-int8-64.txt
new file mode 100644
index 0000000000000..9b30275e61ad9
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/avx-vnni-int8-64.txt
@@ -0,0 +1,243 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
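+# Relative to the 32-bit file, these encodings also exercise the VEX.R/X/B
+# extension bits (reaching xmm12-xmm14, ymm12-ymm14, and the r8/r14 bases) as
+# well as RIP-relative memory operands.
+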
+# ATT: vpdpbssd %ymm14, %ymm13, %ymm12
+# INTEL: vpdpbssd ymm12, ymm13, ymm14
+0xc4,0x42,0x17,0x50,0xe6
+
+# ATT: vpdpbssd %xmm14, %xmm13, %xmm12
+# INTEL: vpdpbssd xmm12, xmm13, xmm14
+0xc4,0x42,0x13,0x50,0xe6
+
+# ATT: vpdpbssd 268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vpdpbssd ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x17,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssd 291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vpdpbssd ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x17,0x50,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssd (%rip), %ymm13, %ymm12
+# INTEL: vpdpbssd ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x17,0x50,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbssd -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vpdpbssd ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x17,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbssd 268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vpdpbssd xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x13,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssd 291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vpdpbssd xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x13,0x50,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssd (%rip), %xmm13, %xmm12
+# INTEL: vpdpbssd xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x13,0x50,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbssd -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vpdpbssd xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x13,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbssds %ymm14, %ymm13, %ymm12
+# INTEL: vpdpbssds ymm12, ymm13, ymm14
+0xc4,0x42,0x17,0x51,0xe6
+
+# ATT: vpdpbssds %xmm14, %xmm13, %xmm12
+# INTEL: vpdpbssds xmm12, xmm13, xmm14
+0xc4,0x42,0x13,0x51,0xe6
+
+# ATT: vpdpbssds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vpdpbssds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x17,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssds 291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vpdpbssds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x17,0x51,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssds (%rip), %ymm13, %ymm12
+# INTEL: vpdpbssds ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x17,0x51,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbssds -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vpdpbssds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x17,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbssds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vpdpbssds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x13,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssds 291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vpdpbssds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x13,0x51,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssds (%rip), %xmm13, %xmm12
+# INTEL: vpdpbssds xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x13,0x51,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbssds -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vpdpbssds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x13,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbsud %ymm14, %ymm13, %ymm12
+# INTEL: vpdpbsud ymm12, ymm13, ymm14
+0xc4,0x42,0x16,0x50,0xe6
+
+# ATT: vpdpbsud %xmm14, %xmm13, %xmm12
+# INTEL: vpdpbsud xmm12, xmm13, xmm14
+0xc4,0x42,0x12,0x50,0xe6
+
+# ATT: vpdpbsud 268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vpdpbsud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x16,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsud 291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vpdpbsud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x16,0x50,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsud (%rip), %ymm13, %ymm12
+# INTEL: vpdpbsud ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x16,0x50,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbsud -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vpdpbsud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x16,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbsud 268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vpdpbsud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x12,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsud 291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vpdpbsud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x12,0x50,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsud (%rip), %xmm13, %xmm12
+# INTEL: vpdpbsud xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x12,0x50,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbsud -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vpdpbsud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x12,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbsuds %ymm14, %ymm13, %ymm12
+# INTEL: vpdpbsuds ymm12, ymm13, ymm14
+0xc4,0x42,0x16,0x51,0xe6
+
+# ATT: vpdpbsuds %xmm14, %xmm13, %xmm12
+# INTEL: vpdpbsuds xmm12, xmm13, xmm14
+0xc4,0x42,0x12,0x51,0xe6
+
+# ATT: vpdpbsuds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vpdpbsuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x16,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsuds 291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vpdpbsuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x16,0x51,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsuds (%rip), %ymm13, %ymm12
+# INTEL: vpdpbsuds ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x16,0x51,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbsuds -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vpdpbsuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x16,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbsuds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vpdpbsuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x12,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsuds 291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vpdpbsuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x12,0x51,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsuds (%rip), %xmm13, %xmm12
+# INTEL: vpdpbsuds xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x12,0x51,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbsuds -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vpdpbsuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x12,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbuud %ymm14, %ymm13, %ymm12
+# INTEL: vpdpbuud ymm12, ymm13, ymm14
+0xc4,0x42,0x14,0x50,0xe6
+
+# ATT: vpdpbuud %xmm14, %xmm13, %xmm12
+# INTEL: vpdpbuud xmm12, xmm13, xmm14
+0xc4,0x42,0x10,0x50,0xe6
+
+# ATT: vpdpbuud 268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vpdpbuud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x14,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuud 291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vpdpbuud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x14,0x50,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuud (%rip), %ymm13, %ymm12
+# INTEL: vpdpbuud ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x14,0x50,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbuud -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vpdpbuud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x14,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbuud 268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vpdpbuud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x10,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuud 291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vpdpbuud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x10,0x50,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuud (%rip), %xmm13, %xmm12
+# INTEL: vpdpbuud xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x10,0x50,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbuud -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vpdpbuud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x10,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbuuds %ymm14, %ymm13, %ymm12
+# INTEL: vpdpbuuds ymm12, ymm13, ymm14
+0xc4,0x42,0x14,0x51,0xe6
+
+# ATT: vpdpbuuds %xmm14, %xmm13, %xmm12
+# INTEL: vpdpbuuds xmm12, xmm13, xmm14
+0xc4,0x42,0x10,0x51,0xe6
+
+# ATT: vpdpbuuds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: vpdpbuuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x14,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuuds 291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: vpdpbuuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x14,0x51,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuuds (%rip), %ymm13, %ymm12
+# INTEL: vpdpbuuds ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x14,0x51,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbuuds -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: vpdpbuuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x14,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbuuds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: vpdpbuuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x10,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuuds 291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: vpdpbuuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x10,0x51,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuuds (%rip), %xmm13, %xmm12
+# INTEL: vpdpbuuds xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x10,0x51,0x25,0x00,0x00,0x00,0x00
+
+# ATT: vpdpbuuds -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: vpdpbuuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x10,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff
+
diff --git a/llvm/test/MC/X86/avx_vnni_int8-32-att.s b/llvm/test/MC/X86/avx_vnni_int8-32-att.s
new file mode 100644
index 0000000000000..3ade562079eb3
--- /dev/null
+++ b/llvm/test/MC/X86/avx_vnni_int8-32-att.s
@@ -0,0 +1,241 @@
+// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxvnniint8 --show-encoding %s | FileCheck %s
+
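+// Operands below follow AT&T order (src2, src1, dst); the companion Intel
+// syntax test checks the same encodings with the destination listed first.
+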
+// CHECK: vpdpbssd %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0xd4]
+ vpdpbssd %ymm4, %ymm3, %ymm2
+
+// CHECK: vpdpbssd %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0xd4]
+ vpdpbssd %xmm4, %xmm3, %xmm2
+
+// CHECK: vpdpbssd 268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbssd 268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK: vpdpbssd 291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbssd 291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK: vpdpbssd (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x10]
+ vpdpbssd (%eax), %ymm3, %ymm2
+
+// CHECK: vpdpbssd -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbssd -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK: vpdpbssd 268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbssd 268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK: vpdpbssd 291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbssd 291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK: vpdpbssd (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x10]
+ vpdpbssd (%eax), %xmm3, %xmm2
+
+// CHECK: vpdpbssd -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbssd -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK: vpdpbssds %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0xd4]
+ vpdpbssds %ymm4, %ymm3, %ymm2
+
+// CHECK: vpdpbssds %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0xd4]
+ vpdpbssds %xmm4, %xmm3, %xmm2
+
+// CHECK: vpdpbssds 268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbssds 268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK: vpdpbssds 291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbssds 291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK: vpdpbssds (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x10]
+ vpdpbssds (%eax), %ymm3, %ymm2
+
+// CHECK: vpdpbssds -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbssds -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK: vpdpbssds 268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbssds 268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK: vpdpbssds 291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbssds 291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK: vpdpbssds (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x10]
+ vpdpbssds (%eax), %xmm3, %xmm2
+
+// CHECK: vpdpbssds -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbssds -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK: vpdpbsud %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0xd4]
+ vpdpbsud %ymm4, %ymm3, %ymm2
+
+// CHECK: vpdpbsud %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0xd4]
+ vpdpbsud %xmm4, %xmm3, %xmm2
+
+// CHECK: vpdpbsud 268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbsud 268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK: vpdpbsud 291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbsud 291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK: vpdpbsud (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x10]
+ vpdpbsud (%eax), %ymm3, %ymm2
+
+// CHECK: vpdpbsud -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbsud -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK: vpdpbsud 268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbsud 268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK: vpdpbsud 291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbsud 291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK: vpdpbsud (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x10]
+ vpdpbsud (%eax), %xmm3, %xmm2
+
+// CHECK: vpdpbsud -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbsud -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK: vpdpbsuds %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0xd4]
+ vpdpbsuds %ymm4, %ymm3, %ymm2
+
+// CHECK: vpdpbsuds %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0xd4]
+ vpdpbsuds %xmm4, %xmm3, %xmm2
+
+// CHECK: vpdpbsuds 268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbsuds 268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK: vpdpbsuds 291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbsuds 291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK: vpdpbsuds (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x10]
+ vpdpbsuds (%eax), %ymm3, %ymm2
+
+// CHECK: vpdpbsuds -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbsuds -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK: vpdpbsuds 268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbsuds 268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK: vpdpbsuds 291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbsuds 291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK: vpdpbsuds (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x10]
+ vpdpbsuds (%eax), %xmm3, %xmm2
+
+// CHECK: vpdpbsuds -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbsuds -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK: vpdpbuud %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0xd4]
+ vpdpbuud %ymm4, %ymm3, %ymm2
+
+// CHECK: vpdpbuud %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0xd4]
+ vpdpbuud %xmm4, %xmm3, %xmm2
+
+// CHECK: vpdpbuud 268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbuud 268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK: vpdpbuud 291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbuud 291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK: vpdpbuud (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x10]
+ vpdpbuud (%eax), %ymm3, %ymm2
+
+// CHECK: vpdpbuud -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbuud -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK: vpdpbuud 268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbuud 268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK: vpdpbuud 291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbuud 291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK: vpdpbuud (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x10]
+ vpdpbuud (%eax), %xmm3, %xmm2
+
+// CHECK: vpdpbuud -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbuud -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK: vpdpbuuds %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0xd4]
+ vpdpbuuds %ymm4, %ymm3, %ymm2
+
+// CHECK: vpdpbuuds %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0xd4]
+ vpdpbuuds %xmm4, %xmm3, %xmm2
+
+// CHECK: vpdpbuuds 268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbuuds 268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK: vpdpbuuds 291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbuuds 291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK: vpdpbuuds (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x10]
+ vpdpbuuds (%eax), %ymm3, %ymm2
+
+// CHECK: vpdpbuuds -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbuuds -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK: vpdpbuuds 268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbuuds 268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK: vpdpbuuds 291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbuuds 291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK: vpdpbuuds (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x10]
+ vpdpbuuds (%eax), %xmm3, %xmm2
+
+// CHECK: vpdpbuuds -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbuuds -512(,%ebp,2), %xmm3, %xmm2
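
[Editorial note between file diffs: the file above checks all three AVX-VNNI-INT8 signedness combinations (vpdpbssd, vpdpbsud, vpdpbuud) plus their saturating "s" forms in AT&T syntax. Per the ISE description, each instruction multiplies four pairs of bytes per 32-bit lane and adds the products into that lane's dword accumulator; the "s" forms saturate instead of wrapping. A minimal scalar sketch of one lane under those semantics; the helper names are illustrative, not from the patch:

    #include <stdint.h>

    /* One dword lane of vpdpbssd: signed x signed bytes, wrapping add. */
    static int32_t dpbssd_lane(int32_t acc, const int8_t a[4], const int8_t b[4]) {
      int64_t sum = acc;
      for (int i = 0; i < 4; ++i)
        sum += (int64_t)a[i] * b[i];   /* each product fits in 16 bits */
      /* Narrowing models the 32-bit wraparound of the non-saturating form. */
      return (int32_t)(uint32_t)sum;
    }

    /* One dword lane of vpdpbssds: same products, saturated result. */
    static int32_t dpbssds_lane(int32_t acc, const int8_t a[4], const int8_t b[4]) {
      int64_t sum = acc;
      for (int i = 0; i < 4; ++i)
        sum += (int64_t)a[i] * b[i];
      if (sum > INT32_MAX) return INT32_MAX;
      if (sum < INT32_MIN) return INT32_MIN;
      return (int32_t)sum;
    }

The sud/uud variants differ only in treating one or both byte sources as unsigned.]
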
diff --git a/llvm/test/MC/X86/avx_vnni_int8-32-intel.s b/llvm/test/MC/X86/avx_vnni_int8-32-intel.s
new file mode 100644
index 0000000000000..aec8ff1c9e5da
--- /dev/null
+++ b/llvm/test/MC/X86/avx_vnni_int8-32-intel.s
@@ -0,0 +1,242 @@
+// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxvnniint8 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: vpdpbssd ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0xd4]
+ vpdpbssd ymm2, ymm3, ymm4
+
+// CHECK: vpdpbssd xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0xd4]
+ vpdpbssd xmm2, xmm3, xmm4
+
+// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbssd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x10]
+ vpdpbssd ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbssd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x10]
+ vpdpbssd xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbssds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0xd4]
+ vpdpbssds ymm2, ymm3, ymm4
+
+// CHECK: vpdpbssds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0xd4]
+ vpdpbssds xmm2, xmm3, xmm4
+
+// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbssds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x10]
+ vpdpbssds ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbssds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x10]
+ vpdpbssds xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbsud ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0xd4]
+ vpdpbsud ymm2, ymm3, ymm4
+
+// CHECK: vpdpbsud xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0xd4]
+ vpdpbsud xmm2, xmm3, xmm4
+
+// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x10]
+ vpdpbsud ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x10]
+ vpdpbsud xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0xd4]
+ vpdpbsuds ymm2, ymm3, ymm4
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0xd4]
+ vpdpbsuds xmm2, xmm3, xmm4
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x10]
+ vpdpbsuds ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x10]
+ vpdpbsuds xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbuud ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0xd4]
+ vpdpbuud ymm2, ymm3, ymm4
+
+// CHECK: vpdpbuud xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0xd4]
+ vpdpbuud xmm2, xmm3, xmm4
+
+// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x10]
+ vpdpbuud ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x10]
+ vpdpbuud xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0xd4]
+ vpdpbuuds ymm2, ymm3, ymm4
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0xd4]
+ vpdpbuuds xmm2, xmm3, xmm4
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x10]
+ vpdpbuuds ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+ vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+ vpdpbuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x10]
+ vpdpbuuds xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
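[Editorial note between file diffs: in the encodings above, the signedness variants differ only in the VEX "pp" implied-prefix field and the 128/256-bit width only in VEX.L, both packed into the third byte of the 3-byte VEX prefix (0x63/0x67 for vpdpbssd = pp F2, 0x62/0x66 for vpdpbsud = pp F3, 0x60/0x64 for vpdpbuud = no prefix). A small sketch that unpacks that byte per the documented field layout; decode_vex_p1 is an illustrative helper, not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    /* Byte 2 of a 3-byte VEX prefix: W | ~vvvv | L | pp. */
    static void decode_vex_p1(uint8_t p1) {
      unsigned w    = (p1 >> 7) & 1;       /* VEX.W */
      unsigned vvvv = (~(p1 >> 3)) & 0xf;  /* second source reg, stored inverted */
      unsigned l    = (p1 >> 2) & 1;       /* 0 = 128-bit, 1 = 256-bit */
      unsigned pp   = p1 & 3;              /* implied legacy prefix */
      static const char *prefix[] = {"none", "66", "F3", "F2"};
      printf("W=%u vvvv=%u L=%u pp=%s\n", w, vvvv, l, prefix[pp]);
    }

    int main(void) {
      decode_vex_p1(0x67); /* vpdpbssd ymm: vvvv=3 (ymm3), L=1, pp=F2   */
      decode_vex_p1(0x62); /* vpdpbsud xmm: vvvv=3 (xmm3), L=0, pp=F3   */
      decode_vex_p1(0x64); /* vpdpbuud ymm: vvvv=3 (ymm3), L=1, pp=none */
      return 0;
    }

]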
diff --git a/llvm/test/MC/X86/avx_vnni_int8-64-att.s b/llvm/test/MC/X86/avx_vnni_int8-64-att.s
new file mode 100644
index 0000000000000..92b490a9be8ef
--- /dev/null
+++ b/llvm/test/MC/X86/avx_vnni_int8-64-att.s
@@ -0,0 +1,242 @@
+// RUN: llvm-mc -triple=x86_64-unknown-unknown -mattr=+avxvnniint8 --show-encoding < %s | FileCheck %s
+
+// CHECK: vpdpbssd %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x17,0x50,0xe6]
+ vpdpbssd %ymm14, %ymm13, %ymm12
+
+// CHECK: vpdpbssd %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x13,0x50,0xe6]
+ vpdpbssd %xmm14, %xmm13, %xmm12
+
+// CHECK: vpdpbssd 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x17,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbssd 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpbssd 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x17,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbssd 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpbssd (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x17,0x50,0x25,0x00,0x00,0x00,0x00]
+ vpdpbssd (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpbssd -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x17,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbssd -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpbssd 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x13,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbssd 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpbssd 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x13,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbssd 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpbssd (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x13,0x50,0x25,0x00,0x00,0x00,0x00]
+ vpdpbssd (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpbssd -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x13,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbssd -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpbssds %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x17,0x51,0xe6]
+ vpdpbssds %ymm14, %ymm13, %ymm12
+
+// CHECK: vpdpbssds %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x13,0x51,0xe6]
+ vpdpbssds %xmm14, %xmm13, %xmm12
+
+// CHECK: vpdpbssds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x17,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbssds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpbssds 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x17,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbssds 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpbssds (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x17,0x51,0x25,0x00,0x00,0x00,0x00]
+ vpdpbssds (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpbssds -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x17,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbssds -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpbssds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x13,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbssds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpbssds 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x13,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbssds 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpbssds (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x13,0x51,0x25,0x00,0x00,0x00,0x00]
+ vpdpbssds (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpbssds -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x13,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbssds -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpbsud %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x16,0x50,0xe6]
+ vpdpbsud %ymm14, %ymm13, %ymm12
+
+// CHECK: vpdpbsud %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x12,0x50,0xe6]
+ vpdpbsud %xmm14, %xmm13, %xmm12
+
+// CHECK: vpdpbsud 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x16,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbsud 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpbsud 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x16,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbsud 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpbsud (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0x50,0x25,0x00,0x00,0x00,0x00]
+ vpdpbsud (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpbsud -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbsud -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpbsud 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x12,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbsud 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpbsud 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x12,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbsud 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpbsud (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0x50,0x25,0x00,0x00,0x00,0x00]
+ vpdpbsud (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpbsud -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbsud -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpbsuds %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x16,0x51,0xe6]
+ vpdpbsuds %ymm14, %ymm13, %ymm12
+
+// CHECK: vpdpbsuds %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x12,0x51,0xe6]
+ vpdpbsuds %xmm14, %xmm13, %xmm12
+
+// CHECK: vpdpbsuds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x16,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbsuds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpbsuds 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x16,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbsuds 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpbsuds (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0x51,0x25,0x00,0x00,0x00,0x00]
+ vpdpbsuds (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpbsuds -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbsuds -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpbsuds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x12,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbsuds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpbsuds 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x12,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbsuds 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpbsuds (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0x51,0x25,0x00,0x00,0x00,0x00]
+ vpdpbsuds (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpbsuds -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbsuds -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpbuud %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x14,0x50,0xe6]
+ vpdpbuud %ymm14, %ymm13, %ymm12
+
+// CHECK: vpdpbuud %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x10,0x50,0xe6]
+ vpdpbuud %xmm14, %xmm13, %xmm12
+
+// CHECK: vpdpbuud 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x14,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbuud 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpbuud 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x14,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbuud 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpbuud (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0x50,0x25,0x00,0x00,0x00,0x00]
+ vpdpbuud (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpbuud -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbuud -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpbuud 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x10,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbuud 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpbuud 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x10,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbuud 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpbuud (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0x50,0x25,0x00,0x00,0x00,0x00]
+ vpdpbuud (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpbuud -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbuud -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpbuuds %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x14,0x51,0xe6]
+ vpdpbuuds %ymm14, %ymm13, %ymm12
+
+// CHECK: vpdpbuuds %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x10,0x51,0xe6]
+ vpdpbuuds %xmm14, %xmm13, %xmm12
+
+// CHECK: vpdpbuuds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x14,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbuuds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpbuuds 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x14,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbuuds 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpbuuds (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0x51,0x25,0x00,0x00,0x00,0x00]
+ vpdpbuuds (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpbuuds -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbuuds -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpbuuds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x10,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbuuds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpbuuds 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x10,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbuuds 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpbuuds (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0x51,0x25,0x00,0x00,0x00,0x00]
+ vpdpbuuds (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpbuuds -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbuuds -512(,%rbp,2), %xmm13, %xmm12
+
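[Editorial note between file diffs: at the source level these instructions are reached through the new avxvnniint8intrin.h intrinsics when compiling with -mavxvnniint8. A minimal usage sketch, assuming the three-operand __m256i signature used by the existing VNNI intrinsics (accumulator first):

    #include <immintrin.h>

    /* w[i] += saturating sum of four signed-byte products per dword lane.
       Compile with -mavxvnniint8; lowers to vpdpbssds. */
    __m256i dot_accumulate(__m256i w, __m256i a, __m256i b) {
      return _mm256_dpbssds_epi32(w, a, b);
    }

]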
diff --git a/llvm/test/MC/X86/avx_vnni_int8-64-intel.s b/llvm/test/MC/X86/avx_vnni_int8-64-intel.s
new file mode 100644
index 0000000000000..0d5b066e710f2
--- /dev/null
+++ b/llvm/test/MC/X86/avx_vnni_int8-64-intel.s
@@ -0,0 +1,242 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+avxvnniint8 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: vpdpbssd ymm12, ymm13, ymm14
+// CHECK: encoding: [0xc4,0x42,0x17,0x50,0xe6]
+ vpdpbssd ymm12, ymm13, ymm14
+
+// CHECK: vpdpbssd xmm12, xmm13, xmm14
+// CHECK: encoding: [0xc4,0x42,0x13,0x50,0xe6]
+ vpdpbssd xmm12, xmm13, xmm14
+
+// CHECK: vpdpbssd ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x17,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbssd ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbssd ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x17,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbssd ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbssd ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x17,0x50,0x25,0x00,0x00,0x00,0x00]
+ vpdpbssd ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpbssd ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x17,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbssd ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpbssd xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x13,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbssd xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbssd xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x13,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbssd xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbssd xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x13,0x50,0x25,0x00,0x00,0x00,0x00]
+ vpdpbssd xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpbssd xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x13,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbssd xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpbssds ymm12, ymm13, ymm14
+// CHECK: encoding: [0xc4,0x42,0x17,0x51,0xe6]
+ vpdpbssds ymm12, ymm13, ymm14
+
+// CHECK: vpdpbssds xmm12, xmm13, xmm14
+// CHECK: encoding: [0xc4,0x42,0x13,0x51,0xe6]
+ vpdpbssds xmm12, xmm13, xmm14
+
+// CHECK: vpdpbssds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x17,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbssds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbssds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x17,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbssds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbssds ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x17,0x51,0x25,0x00,0x00,0x00,0x00]
+ vpdpbssds ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpbssds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x17,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbssds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpbssds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x13,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbssds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbssds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x13,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbssds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbssds xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x13,0x51,0x25,0x00,0x00,0x00,0x00]
+ vpdpbssds xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpbssds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x13,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbssds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpbsud ymm12, ymm13, ymm14
+// CHECK: encoding: [0xc4,0x42,0x16,0x50,0xe6]
+ vpdpbsud ymm12, ymm13, ymm14
+
+// CHECK: vpdpbsud xmm12, xmm13, xmm14
+// CHECK: encoding: [0xc4,0x42,0x12,0x50,0xe6]
+ vpdpbsud xmm12, xmm13, xmm14
+
+// CHECK: vpdpbsud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x16,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbsud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbsud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x16,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbsud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbsud ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x16,0x50,0x25,0x00,0x00,0x00,0x00]
+ vpdpbsud ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpbsud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x16,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbsud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpbsud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x12,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbsud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbsud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x12,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbsud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbsud xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x12,0x50,0x25,0x00,0x00,0x00,0x00]
+ vpdpbsud xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpbsud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x12,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbsud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpbsuds ymm12, ymm13, ymm14
+// CHECK: encoding: [0xc4,0x42,0x16,0x51,0xe6]
+ vpdpbsuds ymm12, ymm13, ymm14
+
+// CHECK: vpdpbsuds xmm12, xmm13, xmm14
+// CHECK: encoding: [0xc4,0x42,0x12,0x51,0xe6]
+ vpdpbsuds xmm12, xmm13, xmm14
+
+// CHECK: vpdpbsuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x16,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbsuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbsuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x16,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbsuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbsuds ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x16,0x51,0x25,0x00,0x00,0x00,0x00]
+ vpdpbsuds ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpbsuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x16,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbsuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpbsuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x12,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbsuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbsuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x12,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbsuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbsuds xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x12,0x51,0x25,0x00,0x00,0x00,0x00]
+ vpdpbsuds xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpbsuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x12,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbsuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpbuud ymm12, ymm13, ymm14
+// CHECK: encoding: [0xc4,0x42,0x14,0x50,0xe6]
+ vpdpbuud ymm12, ymm13, ymm14
+
+// CHECK: vpdpbuud xmm12, xmm13, xmm14
+// CHECK: encoding: [0xc4,0x42,0x10,0x50,0xe6]
+ vpdpbuud xmm12, xmm13, xmm14
+
+// CHECK: vpdpbuud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x14,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbuud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbuud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x14,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbuud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbuud ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x14,0x50,0x25,0x00,0x00,0x00,0x00]
+ vpdpbuud ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpbuud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x14,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbuud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpbuud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x10,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbuud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbuud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x10,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbuud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbuud xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x10,0x50,0x25,0x00,0x00,0x00,0x00]
+ vpdpbuud xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpbuud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x10,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbuud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpbuuds ymm12, ymm13, ymm14
+// CHECK: encoding: [0xc4,0x42,0x14,0x51,0xe6]
+ vpdpbuuds ymm12, ymm13, ymm14
+
+// CHECK: vpdpbuuds xmm12, xmm13, xmm14
+// CHECK: encoding: [0xc4,0x42,0x10,0x51,0xe6]
+ vpdpbuuds xmm12, xmm13, xmm14
+
+// CHECK: vpdpbuuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x14,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbuuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbuuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x14,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbuuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbuuds ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x14,0x51,0x25,0x00,0x00,0x00,0x00]
+ vpdpbuuds ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpbuuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x14,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ vpdpbuuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpbuuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x10,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ vpdpbuuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbuuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x10,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+ vpdpbuuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbuuds xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x10,0x51,0x25,0x00,0x00,0x00,0x00]
+ vpdpbuuds xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpbuuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x10,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ vpdpbuuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+