[clang] 756f597 - [X86] Support Intel avxvnni

via cfe-commits cfe-commits at lists.llvm.org
Fri Oct 30 22:18:35 PDT 2020


Author: Liu, Chen3
Date: 2020-10-31T12:39:51+08:00
New Revision: 756f5978410809530150f5e1cd425e85ad94d1cd

URL: https://github.com/llvm/llvm-project/commit/756f5978410809530150f5e1cd425e85ad94d1cd
DIFF: https://github.com/llvm/llvm-project/commit/756f5978410809530150f5e1cd425e85ad94d1cd.diff

LOG: [X86] Support Intel avxvnni
This patch mainly made the following changes:

1. Support AVX-VNNI instructions;
2. Introduce ExplicitVEXPrefix flag so that vpdpbusd/vpdpbusds/vpdpwssd/vpdpwssds instructions only use vex-encoding when the user explicitly adds the {vex} prefix.

Differential Revision: https://reviews.llvm.org/D89105

Added: 
    clang/lib/Headers/avxvnniintrin.h
    clang/test/CodeGen/X86/avxvnni-builtins.c
    llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
    llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll
    llvm/test/MC/Disassembler/X86/avx_vnni.txt
    llvm/test/MC/Disassembler/X86/intel-syntax-avx_vnni.txt
    llvm/test/MC/Disassembler/X86/intel-syntax-x86-64-avx_vnni.txt
    llvm/test/MC/Disassembler/X86/x86-64-avx_vnni.txt
    llvm/test/MC/X86/avx_vnni-encoding.s
    llvm/test/MC/X86/intel-syntax-avx_vnni.s
    llvm/test/MC/X86/intel-syntax-x86-64-avx_vnni.s
    llvm/test/MC/X86/x86-64-avx_vnni-encoding.s

Modified: 
    clang/docs/ClangCommandLineReference.rst
    clang/docs/ReleaseNotes.rst
    clang/include/clang/Basic/BuiltinsX86.def
    clang/include/clang/Driver/Options.td
    clang/lib/Basic/Targets/X86.cpp
    clang/lib/Basic/Targets/X86.h
    clang/lib/Headers/CMakeLists.txt
    clang/lib/Headers/avx512vlvnniintrin.h
    clang/lib/Headers/cpuid.h
    clang/lib/Headers/immintrin.h
    clang/test/CodeGen/attr-target-x86.c
    clang/test/Driver/x86-target-features.c
    clang/test/Preprocessor/predefined-arch-macros.c
    clang/test/Preprocessor/x86_target_features.c
    llvm/docs/ReleaseNotes.rst
    llvm/include/llvm/Support/X86TargetParser.def
    llvm/lib/Support/Host.cpp
    llvm/lib/Support/X86TargetParser.cpp
    llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
    llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
    llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
    llvm/lib/Target/X86/X86.td
    llvm/lib/Target/X86/X86EvexToVex.cpp
    llvm/lib/Target/X86/X86InstrFoldTables.cpp
    llvm/lib/Target/X86/X86InstrFormats.td
    llvm/lib/Target/X86/X86InstrInfo.cpp
    llvm/lib/Target/X86/X86InstrInfo.td
    llvm/lib/Target/X86/X86InstrSSE.td
    llvm/lib/Target/X86/X86Subtarget.h

Removed: 
    


################################################################################
diff  --git a/clang/docs/ClangCommandLineReference.rst b/clang/docs/ClangCommandLineReference.rst
index 0dca391cf0b5..830d3aed0904 100644
--- a/clang/docs/ClangCommandLineReference.rst
+++ b/clang/docs/ClangCommandLineReference.rst
@@ -3253,6 +3253,8 @@ X86
 
 .. option:: -mavx512vpopcntdq, -mno-avx512vpopcntdq
 
+.. option:: -mavxvnni, -mno-avxvnni
+
 .. option:: -mbmi, -mno-bmi
 
 .. option:: -mbmi2, -mno-bmi2

diff  --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 2a7beba73b69..17c16d729b76 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -205,6 +205,8 @@ X86 Support in Clang
 
 - Support for ``UINTR`` instructions has been added.
 
+- Support for ``AVXVNNI`` instructions has been added.
+
 Internal API Changes
 --------------------
 

diff  --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index 6bfb9b2cf8a5..0f5594f1a4e6 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -960,17 +960,17 @@ TARGET_BUILTIN(__builtin_ia32_alignq256, "V4OiV4OiV4OiIi", "ncV:256:", "avx512vl
 TARGET_BUILTIN(__builtin_ia32_extractf64x4_mask, "V4dV8dIiV4dUc", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_extractf32x4_mask, "V4fV16fIiV4fUc", "ncV:512:", "avx512f")
 
-TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
+TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
+TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
 TARGET_BUILTIN(__builtin_ia32_vpdpbusd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpbusds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
+TARGET_BUILTIN(__builtin_ia32_vpdpbusds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
+TARGET_BUILTIN(__builtin_ia32_vpdpbusds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
 TARGET_BUILTIN(__builtin_ia32_vpdpbusds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
+TARGET_BUILTIN(__builtin_ia32_vpdpwssd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
+TARGET_BUILTIN(__builtin_ia32_vpdpwssd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
 TARGET_BUILTIN(__builtin_ia32_vpdpwssd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
-TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
+TARGET_BUILTIN(__builtin_ia32_vpdpwssds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
+TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
 TARGET_BUILTIN(__builtin_ia32_vpdpwssds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
 
 TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2dvC*V2OiUcIi", "nV:128:", "avx512vl")

diff  --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 069e5805d999..0f7440896cb4 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3235,6 +3235,8 @@ def mavx512vpopcntdq : Flag<["-"], "mavx512vpopcntdq">, Group<m_x86_Features_Gro
 def mno_avx512vpopcntdq : Flag<["-"], "mno-avx512vpopcntdq">, Group<m_x86_Features_Group>;
 def mavx512vp2intersect : Flag<["-"], "mavx512vp2intersect">, Group<m_x86_Features_Group>;
 def mno_avx512vp2intersect : Flag<["-"], "mno-avx512vp2intersect">, Group<m_x86_Features_Group>;
+def mavxvnni : Flag<["-"], "mavxvnni">, Group<m_x86_Features_Group>;
+def mno_avxvnni : Flag<["-"], "mno-avxvnni">, Group<m_x86_Features_Group>;
 def madx : Flag<["-"], "madx">, Group<m_x86_Features_Group>;
 def mno_adx : Flag<["-"], "mno-adx">, Group<m_x86_Features_Group>;
 def maes : Flag<["-"], "maes">, Group<m_x86_Features_Group>;

diff  --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 4ce3a5d0327f..694a8095e336 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -306,6 +306,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasAMXINT8 = true;
     } else if (Feature == "+amx-tile") {
       HasAMXTILE = true;
+    } else if (Feature == "+avxvnni") {
+      HasAVXVNNI = true;
     } else if (Feature == "+serialize") {
       HasSERIALIZE = true;
     } else if (Feature == "+tsxldtrk") {
@@ -728,6 +730,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__AMXINT8__");
   if (HasAMXBF16)
     Builder.defineMacro("__AMXBF16__");
+  if (HasAVXVNNI)
+    Builder.defineMacro("__AVXVNNI__");
   if (HasSERIALIZE)
     Builder.defineMacro("__SERIALIZE__");
   if (HasTSXLDTRK)
@@ -846,6 +850,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
       .Case("avx512vbmi2", true)
       .Case("avx512ifma", true)
       .Case("avx512vp2intersect", true)
+      .Case("avxvnni", true)
       .Case("bmi", true)
       .Case("bmi2", true)
       .Case("cldemote", true)
@@ -918,6 +923,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
       .Case("amx-bf16", HasAMXBF16)
       .Case("amx-int8", HasAMXINT8)
       .Case("amx-tile", HasAMXTILE)
+      .Case("avxvnni", HasAVXVNNI)
       .Case("avx", SSELevel >= AVX)
       .Case("avx2", SSELevel >= AVX2)
       .Case("avx512f", SSELevel >= AVX512F)

diff  --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index 5f9338580857..9979460a55bf 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -130,6 +130,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
   bool HasKL = false;      // For key locker
   bool HasWIDEKL = false; // For wide key locker
   bool HasHRESET = false;
+  bool HasAVXVNNI = false;
   bool HasAMXTILE = false;
   bool HasAMXINT8 = false;
   bool HasAMXBF16 = false;

diff  --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index f9b8bfc21160..b2c0ce8dd4a0 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -35,6 +35,7 @@ set(files
   avx512vnniintrin.h
   avx512vlvnniintrin.h
   avxintrin.h
+  avxvnniintrin.h
   bmi2intrin.h
   bmiintrin.h
   __clang_cuda_builtin_vars.h

diff  --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h
index b7c8fa08c653..71ac1b4370d4 100644
--- a/clang/lib/Headers/avx512vlvnniintrin.h
+++ b/clang/lib/Headers/avx512vlvnniintrin.h
@@ -18,13 +18,157 @@
 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))
 
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
+/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a S, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 7
+///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
+///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
+///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
+///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
+///      DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+///    ENDFOR
+///    DST[MAX:256] := 0
+/// \endoperation
+#define _mm256_dpbusd_epi32(S, A, B) \
+  (__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A,
-                                             (__v8si)__B);
-}
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
+/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a S using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 7
+///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
+///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
+///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
+///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
+///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+///    ENDFOR
+///    DST[MAX:256] := 0
+/// \endoperation
+#define _mm256_dpbusds_epi32(S, A, B) \
+  (__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
+/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
+/// and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 7
+///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
+///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
+///      DST.dword[j] := S.dword[j] + tmp1 + tmp2
+///    ENDFOR
+///    DST[MAX:256] := 0
+/// \endoperation
+#define _mm256_dpwssd_epi32(S, A, B) \
+  (__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
+/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
+/// using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 7
+///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
+///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
+///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
+///    ENDFOR
+///    DST[MAX:256] := 0
+/// \endoperation
+#define _mm256_dpwssds_epi32(S, A, B) \
+  (__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
+/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a S, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 3
+///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
+///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
+///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
+///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
+///      DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+///    ENDFOR
+///    DST[MAX:128] := 0
+/// \endoperation
+#define _mm_dpbusd_epi32(S, A, B) \
+  (__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
+/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a S using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 3
+///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
+///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
+///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
+///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
+///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+///    ENDFOR
+///    DST[MAX:128] := 0
+/// \endoperation
+#define _mm_dpbusds_epi32(S, A, B) \
+  (__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
+/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
+/// and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 3
+///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
+///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
+///      DST.dword[j] := S.dword[j] + tmp1 + tmp2
+///    ENDFOR
+///    DST[MAX:128] := 0
+/// \endoperation
+#define _mm_dpwssd_epi32(S, A, B) \
+  (__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
+/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
+/// using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 3
+///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
+///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
+///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
+///    ENDFOR
+///    DST[MAX:128] := 0
+/// \endoperation
+#define _mm_dpwssds_epi32(S, A, B) \
+  (__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
@@ -42,13 +186,6 @@ _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
                                      (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A,
-                                              (__v8si)__B);
-}
-
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
@@ -65,13 +202,6 @@ _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
                                      (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A,
-                                             (__v8si)__B);
-}
-
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
@@ -88,13 +218,6 @@ _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
                                      (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A,
-                                              (__v8si)__B);
-}
-
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
@@ -111,13 +234,6 @@ _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
                                     (__v8si)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A,
-                                             (__v4si)__B);
-}
-
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
@@ -134,13 +250,6 @@ _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
                                         (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A,
-                                              (__v4si)__B);
-}
-
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
@@ -157,13 +266,6 @@ _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
                                        (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A,
-                                             (__v4si)__B);
-}
-
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
@@ -180,13 +282,6 @@ _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
                                         (__v4si)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A,
-                                              (__v4si)__B);
-}
-
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {

diff  --git a/clang/lib/Headers/avxvnniintrin.h b/clang/lib/Headers/avxvnniintrin.h
new file mode 100644
index 000000000000..ad45cb7962e5
--- /dev/null
+++ b/clang/lib/Headers/avxvnniintrin.h
@@ -0,0 +1,225 @@
+/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXVNNIINTRIN_H
+#define __AVXVNNIINTRIN_H
+
+/* The intrinsics below (from avx512vlvnniintrin.h) also work with AVXVNNI. */
+/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
+/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
+/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
+/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
+
+/* Intrinsics with _avx_ in their names are for compatibility with MSVC. */
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a __S, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 7
+///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
+///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
+///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
+///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
+///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+///    ENDFOR
+///    DST[MAX:256] := 0
+/// \endoperation
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 7
+///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
+///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
+///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
+///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
+///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+///    ENDFOR
+///    DST[MAX:256] := 0
+/// \endoperation
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
+/// and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 7
+///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2
+///    ENDFOR
+///    DST[MAX:256] := 0
+/// \endoperation
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
+/// using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 7
+///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
+///    ENDFOR
+///    DST[MAX:256] := 0
+/// \endoperation
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a __S, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 3
+///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
+///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
+///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
+///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
+///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+///    ENDFOR
+///    DST[MAX:128] := 0
+/// \endoperation
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 3
+///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
+///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
+///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
+///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
+///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+///    ENDFOR
+///    DST[MAX:128] := 0
+/// \endoperation
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
+/// and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 3
+///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2
+///    ENDFOR
+///    DST[MAX:128] := 0
+/// \endoperation
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
+/// using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 3
+///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
+///    ENDFOR
+///    DST[MAX:128] := 0
+/// \endoperation
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif // __AVXVNNIINTRIN_H

diff  --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h
index 6ed7bd04ca0b..c903c3adf981 100644
--- a/clang/lib/Headers/cpuid.h
+++ b/clang/lib/Headers/cpuid.h
@@ -196,6 +196,7 @@
 #define bit_AMXINT8       0x02000000
 
 /* Features in %eax for leaf 7 sub-leaf 1 */
+#define bit_AVXVNNI       0x00000010
 #define bit_AVX512BF16    0x00000020
 #define bit_HRESET        0x00400000
 

diff  --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index c31d2aaa0d52..22f7a520c929 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -145,6 +145,11 @@
 #include <avx512vlvnniintrin.h>
 #endif
 
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVXVNNI__)
+#include <avxvnniintrin.h>
+#endif
+
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
     defined(__AVX512DQ__)
 #include <avx512dqintrin.h>

diff  --git a/clang/test/CodeGen/X86/avxvnni-builtins.c b/clang/test/CodeGen/X86/avxvnni-builtins.c
new file mode 100644
index 000000000000..1e9bb091c68d
--- /dev/null
+++ b/clang/test/CodeGen/X86/avxvnni-builtins.c
@@ -0,0 +1,99 @@
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+#include <immintrin.h>
+
+__m256i test_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_dpbusd_epi32
+  // CHECK: @llvm.x86.avx512.vpdpbusd.256
+  return _mm256_dpbusd_epi32(__S, __A, __B);
+}
+
+__m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_dpbusds_epi32
+  // CHECK: @llvm.x86.avx512.vpdpbusds.256
+  return _mm256_dpbusds_epi32(__S, __A, __B);
+}
+
+__m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_dpwssd_epi32
+  // CHECK: @llvm.x86.avx512.vpdpwssd.256
+  return _mm256_dpwssd_epi32(__S, __A, __B);
+}
+
+__m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_dpwssds_epi32
+  // CHECK: @llvm.x86.avx512.vpdpwssds.256
+  return _mm256_dpwssds_epi32(__S, __A, __B);
+}
+
+__m128i test_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_dpbusd_epi32
+  // CHECK: @llvm.x86.avx512.vpdpbusd.128
+  return _mm_dpbusd_epi32(__S, __A, __B);
+}
+
+__m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_dpbusds_epi32
+  // CHECK: @llvm.x86.avx512.vpdpbusds.128
+  return _mm_dpbusds_epi32(__S, __A, __B);
+}
+
+__m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_dpwssd_epi32
+  // CHECK: @llvm.x86.avx512.vpdpwssd.128
+  return _mm_dpwssd_epi32(__S, __A, __B);
+}
+
+__m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_dpwssds_epi32
+  // CHECK: @llvm.x86.avx512.vpdpwssds.128
+  return _mm_dpwssds_epi32(__S, __A, __B);
+}
+
+__m256i test_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_dpbusd_avx_epi32
+  // CHECK: @llvm.x86.avx512.vpdpbusd.256
+  return _mm256_dpbusd_avx_epi32(__S, __A, __B);
+}
+
+__m256i test_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_dpbusds_avx_epi32
+  // CHECK: @llvm.x86.avx512.vpdpbusds.256
+  return _mm256_dpbusds_avx_epi32(__S, __A, __B);
+}
+
+__m256i test_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_dpwssd_avx_epi32
+  // CHECK: @llvm.x86.avx512.vpdpwssd.256
+  return _mm256_dpwssd_avx_epi32(__S, __A, __B);
+}
+
+__m256i test_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_dpwssds_avx_epi32
+  // CHECK: @llvm.x86.avx512.vpdpwssds.256
+  return _mm256_dpwssds_avx_epi32(__S, __A, __B);
+}
+
+__m128i test_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_dpbusd_avx_epi32
+  // CHECK: @llvm.x86.avx512.vpdpbusd.128
+  return _mm_dpbusd_avx_epi32(__S, __A, __B);
+}
+
+__m128i test_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_dpbusds_avx_epi32
+  // CHECK: @llvm.x86.avx512.vpdpbusds.128
+  return _mm_dpbusds_avx_epi32(__S, __A, __B);
+}
+
+__m128i test_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_dpwssd_avx_epi32
+  // CHECK: @llvm.x86.avx512.vpdpwssd.128
+  return _mm_dpwssd_avx_epi32(__S, __A, __B);
+}
+
+__m128i test_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_dpwssds_avx_epi32
+  // CHECK: @llvm.x86.avx512.vpdpwssds.128
+  return _mm_dpwssds_avx_epi32(__S, __A, __B);
+}

diff  --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c
index 99cca0b4d9dd..af2a8a0ef518 100644
--- a/clang/test/CodeGen/attr-target-x86.c
+++ b/clang/test/CodeGen/attr-target-x86.c
@@ -54,9 +54,9 @@ void __attribute__((target("arch=x86-64-v4"))) x86_64_v4() {}
 // CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87" "tune-cpu"="i686"
 // CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
 // CHECK-NOT: tune-cpu
-// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
+// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
 // CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686"
-// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
+// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
 // CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes"
 // CHECK-NOT: tune-cpu
 // CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-3dnow,-3dnowa,-mmx"

diff  --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index cd79129a887e..b60f4566a43f 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -288,3 +288,8 @@
 // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-uintr %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-UINTR %s
 // UINTR: "-target-feature" "+uintr"
 // NO-UINTR: "-target-feature" "-uintr"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mavxvnni %s -### -o %t.o 2>&1 | FileCheck --check-prefix=AVX-VNNI %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-avxvnni %s -### -o %t.o 2>&1 | FileCheck --check-prefix=NO-AVX-VNNI %s
+// AVX-VNNI: "-target-feature" "+avxvnni"
+// NO-AVX-VNNI: "-target-feature" "-avxvnni"

diff  --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c
index f469b9aded64..052fb3c1bbf3 100644
--- a/clang/test/Preprocessor/predefined-arch-macros.c
+++ b/clang/test/Preprocessor/predefined-arch-macros.c
@@ -1654,6 +1654,7 @@
 // CHECK_SPR_M32: #define __AVX512VL__ 1
 // CHECK_SPR_M32: #define __AVX512VNNI__ 1
 // CHECK_SPR_M32: #define __AVX512VPOPCNTDQ__ 1
+// CHECK_SPR_M32: #define __AVXVNNI__ 1
 // CHECK_SPR_M32: #define __AVX__ 1
 // CHECK_SPR_M32: #define __BMI2__ 1
 // CHECK_SPR_M32: #define __BMI__ 1
@@ -1724,6 +1725,7 @@
 // CHECK_SPR_M64: #define __AVX512VL__ 1
 // CHECK_SPR_M64: #define __AVX512VNNI__ 1
 // CHECK_SPR_M64: #define __AVX512VPOPCNTDQ__ 1
+// CHECK_SPR_M64: #define __AVXVNNI__ 1
 // CHECK_SPR_M64: #define __AVX__ 1
 // CHECK_SPR_M64: #define __BMI2__ 1
 // CHECK_SPR_M64: #define __BMI__ 1
@@ -1782,6 +1784,7 @@
 // CHECK_ADL_M32: #define __AES__ 1
 // CHECK_ADL_M32: #define __AVX2__ 1
 // CHECK_ADL_M32-NOT: AVX512
+// CHECK_ADL_M32: #define __AVXVNNI__ 1
 // CHECK_ADL_M32: #define __AVX__ 1
 // CHECK_ADL_M32: #define __BMI2__ 1
 // CHECK_ADL_M32: #define __BMI__ 1
@@ -1822,6 +1825,7 @@
 // CHECK_ADL_M64: #define __AES__ 1
 // CHECK_ADL_M64: #define __AVX2__ 1
 // CHECK_ADL_M64-NOT: AVX512
+// CHECK_ADL_M64: #define __AVXVNNI__ 1
 // CHECK_ADL_M64: #define __AVX__ 1
 // CHECK_ADL_M64: #define __BMI2__ 1
 // CHECK_ADL_M64: #define __BMI__ 1

diff  --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
index 69078c2ff2be..69a15ae1eaf1 100644
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -544,3 +544,17 @@
 // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-uintr -x c -E -dM -o - %s | FileCheck -check-prefix=NOUINTR %s
 
 // NOUINTR-NOT: #define __UINTR__ 1
+
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNI %s
+
+// AVXVNNI: #define __AVX2__ 1
+// AVXVNNI: #define __AVXVNNI__ 1
+
+// RUN: %clang -target i386-unknown-unknown -march=atom -mno-avxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOAVXVNNI %s
+
+// NOAVXVNNI-NOT: #define __AVXVNNI__ 1
+
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -mno-avx2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNINOAVX2 %s
+
+// AVXVNNINOAVX2-NOT: #define __AVX2__ 1
+// AVXVNNINOAVX2-NOT: #define __AVXVNNI__ 1

diff  --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index e4329d0aac60..a5117493bc4e 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -121,6 +121,7 @@ During this release ...
   the target CPU.
 * Support for ``HRESET`` instructions has been added.
 * Support for ``UINTR`` instructions has been added.
+* Support for ``AVXVNNI`` instructions has been added.
 
 Changes to the AMDGPU Target
 -----------------------------

diff  --git a/llvm/include/llvm/Support/X86TargetParser.def b/llvm/include/llvm/Support/X86TargetParser.def
index 3d7a7756af8a..c0fe76dfdd64 100644
--- a/llvm/include/llvm/Support/X86TargetParser.def
+++ b/llvm/include/llvm/Support/X86TargetParser.def
@@ -190,6 +190,7 @@ X86_FEATURE       (XSAVEC,          "xsavec")
 X86_FEATURE       (XSAVEOPT,        "xsaveopt")
 X86_FEATURE       (XSAVES,          "xsaves")
 X86_FEATURE       (HRESET,          "hreset")
+X86_FEATURE       (AVXVNNI,         "avxvnni")
 // These features aren't really CPU features, but the frontend can set them.
 X86_FEATURE       (RETPOLINE_EXTERNAL_THUNK,    "retpoline-external-thunk")
 X86_FEATURE       (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches")

diff  --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp
index b41717d96a12..f8413d703358 100644
--- a/llvm/lib/Support/Host.cpp
+++ b/llvm/lib/Support/Host.cpp
@@ -1497,6 +1497,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
   Features["amx-int8"]   = HasLeaf7 && ((EDX >> 25) & 1) && HasAMXSave;
   bool HasLeaf7Subleaf1 =
       MaxLevel >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX);
+  Features["avxvnni"]    = HasLeaf7Subleaf1 && ((EAX >> 4) & 1) && HasAVXSave;
   Features["avx512bf16"] = HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save;
   Features["hreset"]     = HasLeaf7Subleaf1 && ((EAX >> 22) & 1);
 

diff  --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp
index 39069b03da9b..131bfd5eb6ac 100644
--- a/llvm/lib/Support/X86TargetParser.cpp
+++ b/llvm/lib/Support/X86TargetParser.cpp
@@ -205,10 +205,10 @@ constexpr FeatureBitset FeaturesSapphireRapids =
     FeatureAVX512BF16 | FeatureAVX512VP2INTERSECT | FeatureCLDEMOTE |
     FeatureENQCMD | FeatureMOVDIR64B | FeatureMOVDIRI | FeaturePTWRITE |
     FeatureSERIALIZE | FeatureSHSTK | FeatureTSXLDTRK | FeatureUINTR |
-    FeatureWAITPKG;
+    FeatureWAITPKG | FeatureAVXVNNI;
 constexpr FeatureBitset FeaturesAlderlake =
     FeaturesSkylakeClient | FeatureCLDEMOTE | FeatureHRESET | FeaturePTWRITE |
-    FeatureSERIALIZE | FeatureWAITPKG;
+    FeatureSERIALIZE | FeatureWAITPKG | FeatureAVXVNNI;
 
 // Intel Atom processors.
 // Bonnell has feature parity with Core2 and adds MOVBE.
@@ -575,6 +575,9 @@ constexpr FeatureBitset ImpliedFeaturesHRESET = {};
 constexpr FeatureBitset ImpliedFeaturesKL = FeatureSSE2;
 constexpr FeatureBitset ImpliedFeaturesWIDEKL = FeatureKL;
 
+// AVXVNNI Features
+constexpr FeatureBitset ImpliedFeaturesAVXVNNI = FeatureAVX2;
+
 constexpr FeatureInfo FeatureInfos[X86::CPU_FEATURE_MAX] = {
 #define X86_FEATURE(ENUM, STR) {{STR}, ImpliedFeatures##ENUM},
 #include "llvm/Support/X86TargetParser.def"

diff  --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 122702a28f48..4952c78e1fc3 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -3845,6 +3845,13 @@ unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) {
       (MCID.TSFlags & X86II::EncodingMask) != X86II::VEX)
     return Match_Unsupported;
 
+  // These instructions are only available with {vex}, {vex2} or {vex3} prefix
+  if (MCID.TSFlags & X86II::ExplicitVEXPrefix &&
+      (ForcedVEXEncoding != VEXEncoding_VEX &&
+       ForcedVEXEncoding != VEXEncoding_VEX2 &&
+       ForcedVEXEncoding != VEXEncoding_VEX3))
+    return Match_Unsupported;
+
   // These instructions match ambiguously with their VEX encoded counterparts
   // and appear first in the matching table. Reject them unless we're forcing
   // EVEX encoding.

diff  --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 8b8fccfb5a41..4db1bfc25177 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -952,7 +952,11 @@ namespace X86II {
 
     // NOTRACK prefix
     NoTrackShift = EVEX_RCShift + 1,
-    NOTRACK = 1ULL << NoTrackShift
+    NOTRACK = 1ULL << NoTrackShift,
+
+    // Only matched with an explicit {vex}, {vex2} or {vex3} pseudo prefix
+    ExplicitVEXShift = NoTrackShift + 1,
+    ExplicitVEXPrefix = 1ULL << ExplicitVEXShift
   };
 
   /// \returns true if the instruction with given opcode is a prefix.

diff  --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index 96f81faf15bb..d8dbbbbf2779 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -348,7 +348,7 @@ void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) {
     O << "\trep\t";
 
   // These all require a pseudo prefix
-  if (Flags & X86::IP_USE_VEX)
+  if ((Flags & X86::IP_USE_VEX) || (TSFlags & X86II::ExplicitVEXPrefix))
     O << "\t{vex}";
   else if (Flags & X86::IP_USE_VEX2)
     O << "\t{vex2}";

diff  --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index ea09503a4a3b..9096d9d54452 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -171,6 +171,9 @@ def FeaturePKU   : SubtargetFeature<"pku", "HasPKU", "true",
 def FeatureVNNI    : SubtargetFeature<"avx512vnni", "HasVNNI", "true",
                           "Enable AVX-512 Vector Neural Network Instructions",
                                       [FeatureAVX512]>;
+def FeatureAVXVNNI    : SubtargetFeature<"avxvnni", "HasAVXVNNI", "true",
+                           "Support AVX_VNNI encoding",
+                                      [FeatureAVX2]>;
 def FeatureBF16    : SubtargetFeature<"avx512bf16", "HasBF16", "true",
                            "Support bfloat16 floating point",
                                       [FeatureBWI]>;
@@ -769,6 +772,7 @@ def ProcessorFeatures {
                                                   FeatureCLDEMOTE,
                                                   FeatureWAITPKG,
                                                   FeaturePTWRITE,
+                                                  FeatureAVXVNNI,
                                                   FeatureTSXLDTRK,
                                                   FeatureENQCMD,
                                                   FeatureSHSTK,
@@ -781,7 +785,8 @@ def ProcessorFeatures {
     !listconcat(ICXFeatures, SPRAdditionalFeatures);
 
   // Alderlake
-  list<SubtargetFeature> ADLAdditionalFeatures = [FeatureCLDEMOTE,
+  list<SubtargetFeature> ADLAdditionalFeatures = [FeatureAVXVNNI,
+                                                  FeatureCLDEMOTE,
                                                   FeatureHRESET,
                                                   FeaturePTWRITE,
                                                   FeatureSERIALIZE,

diff  --git a/llvm/lib/Target/X86/X86EvexToVex.cpp b/llvm/lib/Target/X86/X86EvexToVex.cpp
index 8155ce3d0bb6..97f843fa24eb 100644
--- a/llvm/lib/Target/X86/X86EvexToVex.cpp
+++ b/llvm/lib/Target/X86/X86EvexToVex.cpp
@@ -85,6 +85,8 @@ class EvexToVexInstPass : public MachineFunctionPass {
 private:
   /// Machine instruction info used throughout the class.
   const X86InstrInfo *TII = nullptr;
+
+  const X86Subtarget *ST = nullptr;
 };
 
 } // end anonymous namespace
@@ -94,8 +96,8 @@ char EvexToVexInstPass::ID = 0;
 bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
   TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
 
-  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
-  if (!ST.hasAVX512())
+  ST = &MF.getSubtarget<X86Subtarget>();
+  if (!ST->hasAVX512())
     return false;
 
   bool Changed = false;
@@ -144,10 +146,29 @@ static bool usesExtendedRegister(const MachineInstr &MI) {
 }
 
 // Do any custom cleanup needed to finalize the conversion.
-static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
+static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc,
+                                     const X86Subtarget *ST) {
   (void)NewOpc;
   unsigned Opc = MI.getOpcode();
   switch (Opc) {
+  case X86::VPDPBUSDSZ256m:
+  case X86::VPDPBUSDSZ256r:
+  case X86::VPDPBUSDSZ128m:
+  case X86::VPDPBUSDSZ128r:
+  case X86::VPDPBUSDZ256m:
+  case X86::VPDPBUSDZ256r:
+  case X86::VPDPBUSDZ128m:
+  case X86::VPDPBUSDZ128r:
+  case X86::VPDPWSSDSZ256m:
+  case X86::VPDPWSSDSZ256r:
+  case X86::VPDPWSSDSZ128m:
+  case X86::VPDPWSSDSZ128r:
+  case X86::VPDPWSSDZ256m:
+  case X86::VPDPWSSDZ256r:
+  case X86::VPDPWSSDZ128m:
+  case X86::VPDPWSSDZ128r:
+    // These can only VEX convert if AVXVNNI is enabled.
+    return ST->hasAVXVNNI();
   case X86::VALIGNDZ128rri:
   case X86::VALIGNDZ128rmi:
   case X86::VALIGNQZ128rri:
@@ -259,7 +280,7 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
   if (usesExtendedRegister(MI))
     return false;
 
-  if (!performCustomAdjustments(MI, NewOpc))
+  if (!performCustomAdjustments(MI, NewOpc, ST))
     return false;
 
   MI.setDesc(TII->get(NewOpc));

diff  --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index 41ea3a907874..17fe7f0bd310 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -3748,18 +3748,26 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
   { X86::VPCONFLICTQZ128rrk,         X86::VPCONFLICTQZ128rmk,         0 },
   { X86::VPCONFLICTQZ256rrk,         X86::VPCONFLICTQZ256rmk,         0 },
   { X86::VPCONFLICTQZrrk,            X86::VPCONFLICTQZrmk,            0 },
+  { X86::VPDPBUSDSYrr,               X86::VPDPBUSDSYrm,               0 },
   { X86::VPDPBUSDSZ128r,             X86::VPDPBUSDSZ128m,             0 },
   { X86::VPDPBUSDSZ256r,             X86::VPDPBUSDSZ256m,             0 },
   { X86::VPDPBUSDSZr,                X86::VPDPBUSDSZm,                0 },
+  { X86::VPDPBUSDSrr,                X86::VPDPBUSDSrm,                0 },
+  { X86::VPDPBUSDYrr,                X86::VPDPBUSDYrm,                0 },
   { X86::VPDPBUSDZ128r,              X86::VPDPBUSDZ128m,              0 },
   { X86::VPDPBUSDZ256r,              X86::VPDPBUSDZ256m,              0 },
   { X86::VPDPBUSDZr,                 X86::VPDPBUSDZm,                 0 },
+  { X86::VPDPBUSDrr,                 X86::VPDPBUSDrm,                 0 },
+  { X86::VPDPWSSDSYrr,               X86::VPDPWSSDSYrm,               0 },
   { X86::VPDPWSSDSZ128r,             X86::VPDPWSSDSZ128m,             0 },
   { X86::VPDPWSSDSZ256r,             X86::VPDPWSSDSZ256m,             0 },
   { X86::VPDPWSSDSZr,                X86::VPDPWSSDSZm,                0 },
+  { X86::VPDPWSSDSrr,                X86::VPDPWSSDSrm,                0 },
+  { X86::VPDPWSSDYrr,                X86::VPDPWSSDYrm,                0 },
   { X86::VPDPWSSDZ128r,              X86::VPDPWSSDZ128m,              0 },
   { X86::VPDPWSSDZ256r,              X86::VPDPWSSDZ256m,              0 },
   { X86::VPDPWSSDZr,                 X86::VPDPWSSDZm,                 0 },
+  { X86::VPDPWSSDrr,                 X86::VPDPWSSDrm,                 0 },
   { X86::VPERMBZ128rrkz,             X86::VPERMBZ128rmkz,             0 },
   { X86::VPERMBZ256rrkz,             X86::VPERMBZ256rmkz,             0 },
   { X86::VPERMBZrrkz,                X86::VPERMBZrmkz,                0 },

diff  --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td
index 150e034c0bb6..686b19fc0a6c 100644
--- a/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/llvm/lib/Target/X86/X86InstrFormats.td
@@ -264,6 +264,9 @@ class NotMemoryFoldable { bit isMemoryFoldable = 0; }
 // Prevent EVEX->VEX conversion from considering this instruction.
 class NotEVEX2VEXConvertible { bit notEVEX2VEXConvertible = 1; }
 
+// Only match the instruction with an explicit {vex}/{vex2}/{vex3} prefix.
+class ExplicitVEXPrefix { bit ExplicitVEXPrefix = 1; }
+
 class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
               string AsmStr, Domain d = GenericDomain>
   : Instruction {
@@ -348,6 +351,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
 
   bit isMemoryFoldable = 1;     // Is it allowed to memory fold/unfold this instruction?
   bit notEVEX2VEXConvertible = 0; // Prevent EVEX->VEX conversion.
+  bit ExplicitVEXPrefix = 0; // Force the instruction to use VEX encoding.
 
   // TSFlags layout should be kept in sync with X86BaseInfo.h.
   let TSFlags{6-0}   = FormBits;
@@ -376,6 +380,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
   let TSFlags{51-45} = CD8_Scale;
   let TSFlags{52}    = hasEVEX_RC;
   let TSFlags{53}    = hasNoTrackPrefix;
+  let TSFlags{54}    = ExplicitVEXPrefix;
 }
 
 class PseudoI<dag oops, dag iops, list<dag> pattern>

diff  --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 45fef990e4e5..3ca826fd4194 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -2568,6 +2568,10 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
   case X86::VPTERNLOGQZ256rmbikz:
   case X86::VPTERNLOGQZrmbikz:
     return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+  case X86::VPDPWSSDYrr:
+  case X86::VPDPWSSDrr:
+  case X86::VPDPWSSDSYrr:
+  case X86::VPDPWSSDSrr:
   case X86::VPDPWSSDZ128r:
   case X86::VPDPWSSDZ128rk:
   case X86::VPDPWSSDZ128rkz:

diff  --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index cc51a32eb498..a8ea79506f22 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -910,6 +910,8 @@ def PKU        : Predicate<"Subtarget->hasPKU()">;
 def HasVNNI    : Predicate<"Subtarget->hasVNNI()">;
 def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">;
 def HasBF16      : Predicate<"Subtarget->hasBF16()">;
+def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">;
+def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">;
 
 def HasBITALG    : Predicate<"Subtarget->hasBITALG()">;
 def HasPOPCNT    : Predicate<"Subtarget->hasPOPCNT()">;

diff  --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index dc833406fa19..36dd5cbe9e5f 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -7164,6 +7164,48 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
                                  int_x86_avx_maskstore_pd_256,
                                  WriteFMaskMove64, WriteFMaskMove64Y>;
 
+//===----------------------------------------------------------------------===//
+// AVX_VNNI
+//===----------------------------------------------------------------------===//
+let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst" in
+multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                       bit IsCommutable> {
+  let isCommutable = IsCommutable in
+  def rr  : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
+             (ins VR128:$src1, VR128:$src2, VR128:$src3),
+             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+             [(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
+                                       VR128:$src2, VR128:$src3)))]>,
+             VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+
+  def rm  : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
+             (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+             [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
+                                      (loadv4i32 addr:$src3))))]>,
+             VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+  // The 256-bit (VEX_L) forms below use the YMM scheduling class.
+  let isCommutable = IsCommutable in
+  def Yrr  : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
+             (ins VR256:$src1, VR256:$src2, VR256:$src3),
+             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+             [(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
+                                       VR256:$src2, VR256:$src3)))]>,
+             VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+
+  def Yrm  : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
+             (ins VR256:$src1, VR256:$src2, i256mem:$src3),
+             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+             [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
+                                      (loadv8i32 addr:$src3))))]>,
+             VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+}
+
+defm VPDPBUSD   : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>, ExplicitVEXPrefix;
+defm VPDPBUSDS  : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>, ExplicitVEXPrefix;
+defm VPDPWSSD   : avx_vnni_rm<0x52, "vpdpwssd",  X86Vpdpwssd, 1>, ExplicitVEXPrefix;
+defm VPDPWSSDS  : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>, ExplicitVEXPrefix;
+
 //===----------------------------------------------------------------------===//
 // VPERMIL - Permute Single and Double Floating-Point Values
 //

diff  --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index a189ea9d2caa..0b2362d6c10d 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -355,6 +355,9 @@ class X86Subtarget final : public X86GenSubtargetInfo {
   /// Processor has AVX-512 Vector Neural Network Instructions
   bool HasVNNI = false;
 
+  /// Processor has AVX Vector Neural Network Instructions
+  bool HasAVXVNNI = false;
+
   /// Processor has AVX-512 bfloat16 floating-point extensions
   bool HasBF16 = false;
 
@@ -750,6 +753,7 @@ class X86Subtarget final : public X86GenSubtargetInfo {
   bool useRetpolineIndirectBranches() const {
     return UseRetpolineIndirectBranches;
   }
+  bool hasAVXVNNI() const { return HasAVXVNNI; }
   bool hasAMXTILE() const { return HasAMXTILE; }
   bool hasAMXBF16() const { return HasAMXBF16; }
   bool hasAMXINT8() const { return HasAMXINT8; }

diff  --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
new file mode 100644
index 000000000000..a1db6e54fa79
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVXVNNI
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVX512VNNI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVXVNNI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVX512VNNI
+
+declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+; vpdpbusd, 256-bit: passes the three arguments straight to the intrinsic.
+; Every llc configuration must emit the {vex}-prefixed ymm form with the same
+; VEX bytes; the AVX512 configurations also report EVEX->VEX compression.
+define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusd_256:
+; AVXVNNI:       # %bb.0:
+; AVXVNNI-NEXT:    {vex} vpdpbusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x50,0xc2]
+; AVXVNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusd_256:
+; AVX512VNNI:       # %bb.0:
+; AVX512VNNI-NEXT:    {vex} vpdpbusd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x50,0xc2]
+; AVX512VNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+; vpdpbusd, 128-bit: passes the three arguments straight to the intrinsic.
+; Every llc configuration must emit the {vex}-prefixed xmm form with the same
+; VEX bytes; the AVX512 configurations also report EVEX->VEX compression.
+define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusd_128:
+; AVXVNNI:       # %bb.0:
+; AVXVNNI-NEXT:    {vex} vpdpbusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x50,0xc2]
+; AVXVNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusd_128:
+; AVX512VNNI:       # %bb.0:
+; AVX512VNNI-NEXT:    {vex} vpdpbusd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x50,0xc2]
+; AVX512VNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+; vpdpbusds, 256-bit: passes the three arguments straight to the intrinsic.
+; Every llc configuration must emit the {vex}-prefixed ymm form with the same
+; VEX bytes; the AVX512 configurations also report EVEX->VEX compression.
+define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusds_256:
+; AVXVNNI:       # %bb.0:
+; AVXVNNI-NEXT:    {vex} vpdpbusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x51,0xc2]
+; AVXVNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusds_256:
+; AVX512VNNI:       # %bb.0:
+; AVX512VNNI-NEXT:    {vex} vpdpbusds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x51,0xc2]
+; AVX512VNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+; vpdpbusds, 128-bit: passes the three arguments straight to the intrinsic.
+; Every llc configuration must emit the {vex}-prefixed xmm form with the same
+; VEX bytes; the AVX512 configurations also report EVEX->VEX compression.
+define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusds_128:
+; AVXVNNI:       # %bb.0:
+; AVXVNNI-NEXT:    {vex} vpdpbusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x51,0xc2]
+; AVXVNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusds_128:
+; AVX512VNNI:       # %bb.0:
+; AVX512VNNI-NEXT:    {vex} vpdpbusds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x51,0xc2]
+; AVX512VNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+; vpdpwssd, 256-bit: passes the three arguments straight to the intrinsic.
+; Every llc configuration must emit the {vex}-prefixed ymm form with the same
+; VEX bytes; the AVX512 configurations also report EVEX->VEX compression.
+define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_256:
+; AVXVNNI:       # %bb.0:
+; AVXVNNI-NEXT:    {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2]
+; AVXVNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssd_256:
+; AVX512VNNI:       # %bb.0:
+; AVX512VNNI-NEXT:    {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x52,0xc2]
+; AVX512VNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+; vpdpwssd, 128-bit: passes the three arguments straight to the intrinsic.
+; Every llc configuration must emit the {vex}-prefixed xmm form with the same
+; VEX bytes; the AVX512 configurations also report EVEX->VEX compression.
+define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_128:
+; AVXVNNI:       # %bb.0:
+; AVXVNNI-NEXT:    {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2]
+; AVXVNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssd_128:
+; AVX512VNNI:       # %bb.0:
+; AVX512VNNI-NEXT:    {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x52,0xc2]
+; AVX512VNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+; vpdpwssds, 256-bit: passes the three arguments straight to the intrinsic.
+; Every llc configuration must emit the {vex}-prefixed ymm form with the same
+; VEX bytes; the AVX512 configurations also report EVEX->VEX compression.
+define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_256:
+; AVXVNNI:       # %bb.0:
+; AVXVNNI-NEXT:    {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2]
+; AVXVNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssds_256:
+; AVX512VNNI:       # %bb.0:
+; AVX512VNNI-NEXT:    {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x53,0xc2]
+; AVX512VNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+; vpdpwssds, 128-bit: passes the three arguments straight to the intrinsic.
+; Every llc configuration must emit the {vex}-prefixed xmm form with the same
+; VEX bytes; the AVX512 configurations also report EVEX->VEX compression.
+define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_128:
+; AVXVNNI:       # %bb.0:
+; AVXVNNI-NEXT:    {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2]
+; AVXVNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssds_128:
+; AVX512VNNI:       # %bb.0:
+; AVX512VNNI-NEXT:    {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x53,0xc2]
+; AVX512VNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  ret <4 x i32> %res
+}

diff  --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll
new file mode 100644
index 000000000000..4b0f63f9a638
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll
@@ -0,0 +1,242 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avxvnni < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+; vpdpwssd, 128-bit: the inline asm clobbers xmm3-xmm15, so %a2 is expected
+; to be spilled around it and then consumed straight from its stack slot as
+; the instruction's memory operand (folded reload).
+define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpwssd:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+  ret <4 x i32> %2
+}
+
+; vpdpwssd, 128-bit, with %a1 and %a2 swapped in the intrinsic call: the
+; spilled value is still expected to fold as the memory operand, i.e. the
+; two multiplicand operands get exchanged to make the fold possible.
+define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpwssd_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+
+; vpdpwssd, 256-bit: the inline asm clobbers xmm3-xmm15, so %a2 is expected
+; to be spilled around it and then consumed straight from its stack slot as
+; the instruction's memory operand (folded reload).
+define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpwssd_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+  ret <8 x i32> %2
+}
+
+; vpdpwssd, 256-bit, with %a1 and %a2 swapped in the intrinsic call: the
+; spilled value is still expected to fold as the memory operand, i.e. the
+; two multiplicand operands get exchanged to make the fold possible.
+define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpwssd_256_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+  ret <8 x i32> %2
+}
+
+; vpdpwssds, 128-bit: the inline asm clobbers xmm3-xmm15, so %a2 is expected
+; to be spilled around it and then consumed straight from its stack slot as
+; the instruction's memory operand (folded reload).
+define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpwssds:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+  ret <4 x i32> %2
+}
+
+; vpdpwssds, 128-bit, with %a1 and %a2 swapped in the intrinsic call: the
+; spilled value is still expected to fold as the memory operand, i.e. the
+; two multiplicand operands get exchanged to make the fold possible.
+define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpwssds_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+
+; vpdpwssds, 256-bit: the inline asm clobbers xmm3-xmm15, so %a2 is expected
+; to be spilled around it and then consumed straight from its stack slot as
+; the instruction's memory operand (folded reload).
+define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpwssds_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+  ret <8 x i32> %2
+}
+
+; vpdpwssds, 256-bit, with %a1 and %a2 swapped in the intrinsic call: the
+; spilled value is still expected to fold as the memory operand, i.e. the
+; two multiplicand operands get exchanged to make the fold possible.
+define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpwssds_256_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+  ret <8 x i32> %2
+}
+
+; vpdpbusd, 128-bit: the inline asm clobbers xmm3-xmm15, so %a2 is expected
+; to be spilled around it and then consumed straight from its stack slot as
+; the instruction's memory operand (folded reload).
+define <4 x i32> @stack_fold_vpdpbusd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbusd:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    {vex} vpdpbusd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+  ret <4 x i32> %2
+}
+
+; vpdpbusd, 128-bit, with %a1 and %a2 swapped in the intrinsic call: no fold
+; is expected here. The spilled value is reloaded into %xmm2 and the
+; register-register form is used.
+; NOTE(review): presumably because vpdpbusd's two multiplicands are not
+; interchangeable (unsigned x signed bytes) - confirm.
+define <4 x i32> @stack_fold_vpdpbusd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbusd_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; CHECK-NEXT:    {vex} vpdpbusd %xmm1, %xmm2, %xmm0
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+
+; vpdpbusd, 256-bit: the inline asm clobbers xmm3-xmm15, so %a2 is expected
+; to be spilled around it and then consumed straight from its stack slot as
+; the instruction's memory operand (folded reload).
+define <8 x i32> @stack_fold_vpdpbusd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbusd_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    {vex} vpdpbusd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+  ret <8 x i32> %2
+}
+
+; vpdpbusd, 256-bit, with %a1 and %a2 swapped in the intrinsic call: no fold
+; is expected here. The spilled value is reloaded into %ymm2 and the
+; register-register form is used.
+; NOTE(review): presumably because vpdpbusd's two multiplicands are not
+; interchangeable (unsigned x signed bytes) - confirm.
+define <8 x i32> @stack_fold_vpdpbusd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbusd_256_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; CHECK-NEXT:    {vex} vpdpbusd %ymm1, %ymm2, %ymm0
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+  ret <8 x i32> %2
+}
+
+; vpdpbusds, 128-bit: the inline asm clobbers xmm3-xmm15, so %a2 is expected
+; to be spilled around it and then consumed straight from its stack slot as
+; the instruction's memory operand (folded reload).
+define <4 x i32> @stack_fold_vpdpbusds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbusds:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    {vex} vpdpbusds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+  ret <4 x i32> %2
+}
+
+; vpdpbusds, 128-bit, with %a1 and %a2 swapped in the intrinsic call: no fold
+; is expected here. The spilled value is reloaded into %xmm2 and the
+; register-register form is used.
+; NOTE(review): presumably because vpdpbusds's two multiplicands are not
+; interchangeable (unsigned x signed bytes) - confirm.
+define <4 x i32> @stack_fold_vpdpbusds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbusds_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; CHECK-NEXT:    {vex} vpdpbusds %xmm1, %xmm2, %xmm0
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+
+; vpdpbusds, 256-bit: the inline asm clobbers xmm3-xmm15, so %a2 is expected
+; to be spilled around it and then consumed straight from its stack slot as
+; the instruction's memory operand (folded reload).
+define <8 x i32> @stack_fold_vpdpbusds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbusds_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    {vex} vpdpbusds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+  ret <8 x i32> %2
+}
+
+; vpdpbusds, 256-bit, with %a1 and %a2 swapped in the intrinsic call: no fold
+; is expected here. The spilled value is reloaded into %ymm2 and the
+; register-register form is used.
+; NOTE(review): presumably because vpdpbusds's two multiplicands are not
+; interchangeable (unsigned x signed bytes) - confirm.
+define <8 x i32> @stack_fold_vpdpbusds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbusds_256_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; CHECK-NEXT:    {vex} vpdpbusds %ymm1, %ymm2, %ymm0
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+  ret <8 x i32> %2
+}

diff  --git a/llvm/test/MC/Disassembler/X86/avx_vnni.txt b/llvm/test/MC/Disassembler/X86/avx_vnni.txt
new file mode 100644
index 000000000000..b7744fa8d85b
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/avx_vnni.txt
@@ -0,0 +1,170 @@
+# RUN: llvm-mc --disassemble %s -triple=i686-apple-darwin9 | FileCheck %s
+
+# CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0xf4
+
+# CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0xf4
+
+# CHECK: {vex} vpdpbusd  268435456(%esp,%esi,8), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusd  291(%edi,%eax,4), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusd  (%eax), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0x30
+
+# CHECK: {vex} vpdpbusd  -1024(,%ebp,2), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpbusd  4064(%ecx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpbusd  -4096(%edx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpbusd  268435456(%esp,%esi,8), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusd  291(%edi,%eax,4), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusd  (%eax), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0x30
+
+# CHECK: {vex} vpdpbusd  -512(,%ebp,2), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpbusd  2032(%ecx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpbusd  -2048(%edx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0xf4
+
+# CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0xf4
+
+# CHECK: {vex} vpdpbusds  268435456(%esp,%esi,8), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusds  291(%edi,%eax,4), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusds  (%eax), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0x30
+
+# CHECK: {vex} vpdpbusds  -1024(,%ebp,2), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpbusds  4064(%ecx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpbusds  -4096(%edx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpbusds  268435456(%esp,%esi,8), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusds  291(%edi,%eax,4), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusds  (%eax), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0x30
+
+# CHECK: {vex} vpdpbusds  -512(,%ebp,2), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpbusds  2032(%ecx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpbusds  -2048(%edx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpwssd %ymm4, %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0xf4
+
+# CHECK: {vex} vpdpwssd %xmm4, %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0xf4
+
+# CHECK: {vex} vpdpwssd  268435456(%esp,%esi,8), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssd  291(%edi,%eax,4), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssd  (%eax), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0x30
+
+# CHECK: {vex} vpdpwssd  -1024(,%ebp,2), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpwssd  4064(%ecx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpwssd  -4096(%edx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpwssd  268435456(%esp,%esi,8), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssd  291(%edi,%eax,4), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssd  (%eax), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0x30
+
+# CHECK: {vex} vpdpwssd  -512(,%ebp,2), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpwssd  2032(%ecx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpwssd  -2048(%edx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpwssds %ymm4, %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0xf4
+
+# CHECK: {vex} vpdpwssds %xmm4, %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0xf4
+
+# CHECK: {vex} vpdpwssds  268435456(%esp,%esi,8), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssds  291(%edi,%eax,4), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssds  (%eax), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0x30
+
+# CHECK: {vex} vpdpwssds  -1024(,%ebp,2), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpwssds  4064(%ecx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpwssds  -4096(%edx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpwssds  268435456(%esp,%esi,8), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssds  291(%edi,%eax,4), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssds  (%eax), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0x30
+
+# CHECK: {vex} vpdpwssds  -512(,%ebp,2), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpwssds  2032(%ecx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpwssds  -2048(%edx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff
+

diff  --git a/llvm/test/MC/Disassembler/X86/intel-syntax-avx_vnni.txt b/llvm/test/MC/Disassembler/X86/intel-syntax-avx_vnni.txt
new file mode 100644
index 000000000000..b21a0e551407
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/intel-syntax-avx_vnni.txt
@@ -0,0 +1,170 @@
+# RUN: llvm-mc --disassemble %s -triple=i686 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4
+0xc4,0xe2,0x55,0x50,0xf4
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4
+0xc4,0xe2,0x51,0x50,0xf4
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [eax]
+0xc4,0xe2,0x55,0x50,0x30
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [eax]
+0xc4,0xe2,0x51,0x50,0x30
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4
+0xc4,0xe2,0x55,0x51,0xf4
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4
+0xc4,0xe2,0x51,0x51,0xf4
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [eax]
+0xc4,0xe2,0x55,0x51,0x30
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [eax]
+0xc4,0xe2,0x51,0x51,0x30
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4
+0xc4,0xe2,0x55,0x52,0xf4
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4
+0xc4,0xe2,0x51,0x52,0xf4
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [eax]
+0xc4,0xe2,0x55,0x52,0x30
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [eax]
+0xc4,0xe2,0x51,0x52,0x30
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4
+0xc4,0xe2,0x55,0x53,0xf4
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4
+0xc4,0xe2,0x51,0x53,0xf4
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [eax]
+0xc4,0xe2,0x55,0x53,0x30
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edx - 4096]
+0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [eax]
+0xc4,0xe2,0x51,0x53,0x30
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edx - 2048]
+0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff
+

diff  --git a/llvm/test/MC/Disassembler/X86/intel-syntax-x86-64-avx_vnni.txt b/llvm/test/MC/Disassembler/X86/intel-syntax-x86-64-avx_vnni.txt
new file mode 100644
index 000000000000..ba57b283aa65
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/intel-syntax-x86-64-avx_vnni.txt
@@ -0,0 +1,170 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4
+0xc4,0xe2,0x55,0x50,0xf4
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4
+0xc4,0xe2,0x51,0x50,0xf4
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rip]
+0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rcx + 4064]
+0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rdx - 4096]
+0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rip]
+0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*rbp - 512]
+0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rcx + 2032]
+0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rdx - 2048]
+0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4
+0xc4,0xe2,0x55,0x51,0xf4
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4
+0xc4,0xe2,0x51,0x51,0xf4
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rip]
+0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rcx + 4064]
+0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rdx - 4096]
+0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rip]
+0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*rbp - 512]
+0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rcx + 2032]
+0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rdx - 2048]
+0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4
+0xc4,0xe2,0x55,0x52,0xf4
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4
+0xc4,0xe2,0x51,0x52,0xf4
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rip]
+0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rcx + 4064]
+0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rdx - 4096]
+0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rip]
+0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*rbp - 512]
+0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rcx + 2032]
+0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rdx - 2048]
+0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4
+0xc4,0xe2,0x55,0x53,0xf4
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4
+0xc4,0xe2,0x51,0x53,0xf4
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rip]
+0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rcx + 4064]
+0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rdx - 4096]
+0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rip]
+0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*rbp - 512]
+0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rcx + 2032]
+0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rdx - 2048]
+0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff
+

diff  --git a/llvm/test/MC/Disassembler/X86/x86-64-avx_vnni.txt b/llvm/test/MC/Disassembler/X86/x86-64-avx_vnni.txt
new file mode 100644
index 000000000000..043b428d4736
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/x86-64-avx_vnni.txt
@@ -0,0 +1,170 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 | FileCheck %s
+
+# CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0xf4
+
+# CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0xf4
+
+# CHECK: {vex} vpdpbusd  268435456(%rbp,%r14,8), %ymm5, %ymm6
+0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusd  291(%r8,%rax,4), %ymm5, %ymm6
+0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusd  (%rip), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpbusd  -1024(,%rbp,2), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpbusd  4064(%rcx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpbusd  -4096(%rdx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpbusd  268435456(%rbp,%r14,8), %xmm5, %xmm6
+0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusd  291(%r8,%rax,4), %xmm5, %xmm6
+0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusd  (%rip), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpbusd  -512(,%rbp,2), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpbusd  2032(%rcx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpbusd  -2048(%rdx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0xf4
+
+# CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0xf4
+
+# CHECK: {vex} vpdpbusds  268435456(%rbp,%r14,8), %ymm5, %ymm6
+0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusds  291(%r8,%rax,4), %ymm5, %ymm6
+0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusds  (%rip), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpbusds  -1024(,%rbp,2), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpbusds  4064(%rcx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpbusds  -4096(%rdx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpbusds  268435456(%rbp,%r14,8), %xmm5, %xmm6
+0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpbusds  291(%r8,%rax,4), %xmm5, %xmm6
+0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpbusds  (%rip), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpbusds  -512(,%rbp,2), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpbusds  2032(%rcx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpbusds  -2048(%rdx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpwssd %ymm4, %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0xf4
+
+# CHECK: {vex} vpdpwssd %xmm4, %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0xf4
+
+# CHECK: {vex} vpdpwssd  268435456(%rbp,%r14,8), %ymm5, %ymm6
+0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssd  291(%r8,%rax,4), %ymm5, %ymm6
+0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssd  (%rip), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpwssd  -1024(,%rbp,2), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpwssd  4064(%rcx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpwssd  -4096(%rdx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpwssd  268435456(%rbp,%r14,8), %xmm5, %xmm6
+0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssd  291(%r8,%rax,4), %xmm5, %xmm6
+0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssd  (%rip), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpwssd  -512(,%rbp,2), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpwssd  2032(%rcx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpwssd  -2048(%rdx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff
+
+# CHECK: {vex} vpdpwssds %ymm4, %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0xf4
+
+# CHECK: {vex} vpdpwssds %xmm4, %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0xf4
+
+# CHECK: {vex} vpdpwssds  268435456(%rbp,%r14,8), %ymm5, %ymm6
+0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssds  291(%r8,%rax,4), %ymm5, %ymm6
+0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssds  (%rip), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpwssds  -1024(,%rbp,2), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff
+
+# CHECK: {vex} vpdpwssds  4064(%rcx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00
+
+# CHECK: {vex} vpdpwssds  -4096(%rdx), %ymm5, %ymm6
+0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff
+
+# CHECK: {vex} vpdpwssds  268435456(%rbp,%r14,8), %xmm5, %xmm6
+0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# CHECK: {vex} vpdpwssds  291(%r8,%rax,4), %xmm5, %xmm6
+0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00
+
+# CHECK: {vex} vpdpwssds  (%rip), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00
+
+# CHECK: {vex} vpdpwssds  -512(,%rbp,2), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff
+
+# CHECK: {vex} vpdpwssds  2032(%rcx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00
+
+# CHECK: {vex} vpdpwssds  -2048(%rdx), %xmm5, %xmm6
+0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff
+

diff  --git a/llvm/test/MC/X86/avx_vnni-encoding.s b/llvm/test/MC/X86/avx_vnni-encoding.s
new file mode 100644
index 000000000000..7baf2a2c4006
--- /dev/null
+++ b/llvm/test/MC/X86/avx_vnni-encoding.s
@@ -0,0 +1,226 @@
+// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxvnni --show-encoding < %s  | FileCheck %s
+
+// CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4]
+          {vex} vpdpbusd %ymm4, %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4]
+          {vex} vpdpbusd %xmm4, %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd  268435456(%esp,%esi,8), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10]
+          {vex} vpdpbusd  268435456(%esp,%esi,8), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd  291(%edi,%eax,4), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00]
+          {vex} vpdpbusd  291(%edi,%eax,4), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd  (%eax), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x30]
+          {vex} vpdpbusd  (%eax), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd  -1024(,%ebp,2), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          {vex} vpdpbusd  -1024(,%ebp,2), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd  4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00]
+          {vex} vpdpbusd  4064(%ecx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd  -4096(%edx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff]
+          {vex} vpdpbusd  -4096(%edx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd  268435456(%esp,%esi,8), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10]
+          {vex} vpdpbusd  268435456(%esp,%esi,8), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd  291(%edi,%eax,4), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00]
+          {vex} vpdpbusd  291(%edi,%eax,4), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd  (%eax), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x30]
+          {vex} vpdpbusd  (%eax), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd  -512(,%ebp,2), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          {vex} vpdpbusd  -512(,%ebp,2), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd  2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00]
+          {vex} vpdpbusd  2032(%ecx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd  -2048(%edx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff]
+          {vex} vpdpbusd  -2048(%edx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4]
+          {vex} vpdpbusds %ymm4, %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4]
+          {vex} vpdpbusds %xmm4, %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds  268435456(%esp,%esi,8), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
+          {vex} vpdpbusds  268435456(%esp,%esi,8), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds  291(%edi,%eax,4), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00]
+          {vex} vpdpbusds  291(%edi,%eax,4), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds  (%eax), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x30]
+          {vex} vpdpbusds  (%eax), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds  -1024(,%ebp,2), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          {vex} vpdpbusds  -1024(,%ebp,2), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds  4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00]
+          {vex} vpdpbusds  4064(%ecx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds  -4096(%edx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff]
+          {vex} vpdpbusds  -4096(%edx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds  268435456(%esp,%esi,8), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
+          {vex} vpdpbusds  268435456(%esp,%esi,8), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds  291(%edi,%eax,4), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00]
+          {vex} vpdpbusds  291(%edi,%eax,4), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds  (%eax), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x30]
+          {vex} vpdpbusds  (%eax), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds  -512(,%ebp,2), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          {vex} vpdpbusds  -512(,%ebp,2), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds  2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00]
+          {vex} vpdpbusds  2032(%ecx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds  -2048(%edx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff]
+          {vex} vpdpbusds  -2048(%edx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4]
+          {vex} vpdpwssd %ymm4, %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4]
+          {vex} vpdpwssd %xmm4, %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd  268435456(%esp,%esi,8), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10]
+          {vex} vpdpwssd  268435456(%esp,%esi,8), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd  291(%edi,%eax,4), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00]
+          {vex} vpdpwssd  291(%edi,%eax,4), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd  (%eax), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x30]
+          {vex} vpdpwssd  (%eax), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd  -1024(,%ebp,2), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          {vex} vpdpwssd  -1024(,%ebp,2), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd  4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00]
+          {vex} vpdpwssd  4064(%ecx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd  -4096(%edx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff]
+          {vex} vpdpwssd  -4096(%edx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd  268435456(%esp,%esi,8), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10]
+          {vex} vpdpwssd  268435456(%esp,%esi,8), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd  291(%edi,%eax,4), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00]
+          {vex} vpdpwssd  291(%edi,%eax,4), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd  (%eax), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x30]
+          {vex} vpdpwssd  (%eax), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd  -512(,%ebp,2), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          {vex} vpdpwssd  -512(,%ebp,2), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd  2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00]
+          {vex} vpdpwssd  2032(%ecx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd  -2048(%edx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff]
+          {vex} vpdpwssd  -2048(%edx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4]
+          {vex} vpdpwssds %ymm4, %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4]
+          {vex} vpdpwssds %xmm4, %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds  268435456(%esp,%esi,8), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10]
+          {vex} vpdpwssds  268435456(%esp,%esi,8), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds  291(%edi,%eax,4), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00]
+          {vex} vpdpwssds  291(%edi,%eax,4), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds  (%eax), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x30]
+          {vex} vpdpwssds  (%eax), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds  -1024(,%ebp,2), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          {vex} vpdpwssds  -1024(,%ebp,2), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds  4064(%ecx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00]
+          {vex} vpdpwssds  4064(%ecx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds  -4096(%edx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff]
+          {vex} vpdpwssds  -4096(%edx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds  268435456(%esp,%esi,8), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10]
+          {vex} vpdpwssds  268435456(%esp,%esi,8), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds  291(%edi,%eax,4), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00]
+          {vex} vpdpwssds  291(%edi,%eax,4), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds  (%eax), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x30]
+          {vex} vpdpwssds  (%eax), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds  -512(,%ebp,2), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          {vex} vpdpwssds  -512(,%ebp,2), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds  2032(%ecx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00]
+          {vex} vpdpwssds  2032(%ecx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds  -2048(%edx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff]
+          {vex} vpdpwssds  -2048(%edx), %xmm5, %xmm6
+

diff  --git a/llvm/test/MC/X86/intel-syntax-avx_vnni.s b/llvm/test/MC/X86/intel-syntax-avx_vnni.s
new file mode 100644
index 000000000000..b905ed82f0c2
--- /dev/null
+++ b/llvm/test/MC/X86/intel-syntax-avx_vnni.s
@@ -0,0 +1,226 @@
+// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxvnni -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4]
+          {vex} vpdpbusd ymm6, ymm5, ymm4
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4]
+          {vex} vpdpbusd xmm6, xmm5, xmm4
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10]
+          {vex} vpdpbusd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00]
+          {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x30]
+          {vex} vpdpbusd ymm6, ymm5, ymmword ptr [eax]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00]
+          {vex} vpdpbusd ymm6, ymm5, ymmword ptr [ecx + 4064]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff]
+          {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edx - 4096]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10]
+          {vex} vpdpbusd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00]
+          {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x30]
+          {vex} vpdpbusd xmm6, xmm5, xmmword ptr [eax]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*ebp - 512]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00]
+          {vex} vpdpbusd xmm6, xmm5, xmmword ptr [ecx + 2032]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff]
+          {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edx - 2048]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4]
+          {vex} vpdpbusds ymm6, ymm5, ymm4
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4]
+          {vex} vpdpbusds xmm6, xmm5, xmm4
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
+          {vex} vpdpbusds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00]
+          {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x30]
+          {vex} vpdpbusds ymm6, ymm5, ymmword ptr [eax]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00]
+          {vex} vpdpbusds ymm6, ymm5, ymmword ptr [ecx + 4064]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff]
+          {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edx - 4096]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
+          {vex} vpdpbusds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00]
+          {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x30]
+          {vex} vpdpbusds xmm6, xmm5, xmmword ptr [eax]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*ebp - 512]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00]
+          {vex} vpdpbusds xmm6, xmm5, xmmword ptr [ecx + 2032]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff]
+          {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edx - 2048]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4]
+          {vex} vpdpwssd ymm6, ymm5, ymm4
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4]
+          {vex} vpdpwssd xmm6, xmm5, xmm4
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10]
+          {vex} vpdpwssd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00]
+          {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x30]
+          {vex} vpdpwssd ymm6, ymm5, ymmword ptr [eax]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00]
+          {vex} vpdpwssd ymm6, ymm5, ymmword ptr [ecx + 4064]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff]
+          {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edx - 4096]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10]
+          {vex} vpdpwssd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00]
+          {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x30]
+          {vex} vpdpwssd xmm6, xmm5, xmmword ptr [eax]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*ebp - 512]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00]
+          {vex} vpdpwssd xmm6, xmm5, xmmword ptr [ecx + 2032]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff]
+          {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edx - 2048]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4]
+          {vex} vpdpwssds ymm6, ymm5, ymm4
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4]
+          {vex} vpdpwssds xmm6, xmm5, xmm4
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10]
+          {vex} vpdpwssds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00]
+          {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x30]
+          {vex} vpdpwssds ymm6, ymm5, ymmword ptr [eax]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00]
+          {vex} vpdpwssds ymm6, ymm5, ymmword ptr [ecx + 4064]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff]
+          {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edx - 4096]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10]
+          {vex} vpdpwssds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00]
+          {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x30]
+          {vex} vpdpwssds xmm6, xmm5, xmmword ptr [eax]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*ebp - 512]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00]
+          {vex} vpdpwssds xmm6, xmm5, xmmword ptr [ecx + 2032]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff]
+          {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edx - 2048]
+

diff  --git a/llvm/test/MC/X86/intel-syntax-x86-64-avx_vnni.s b/llvm/test/MC/X86/intel-syntax-x86-64-avx_vnni.s
new file mode 100644
index 000000000000..48966b3a0736
--- /dev/null
+++ b/llvm/test/MC/X86/intel-syntax-x86-64-avx_vnni.s
@@ -0,0 +1,226 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+avxvnni -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4]
+          {vex} vpdpbusd ymm6, ymm5, ymm4
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4]
+          {vex} vpdpbusd xmm6, xmm5, xmm4
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+          {vex} vpdpbusd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00]
+          {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rip]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00]
+          {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rcx + 4064]
+
+// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff]
+          {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rdx - 4096]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+          {vex} vpdpbusd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00]
+          {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rip]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*rbp - 512]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00]
+          {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rcx + 2032]
+
+// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff]
+          {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rdx - 2048]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4]
+          {vex} vpdpbusds ymm6, ymm5, ymm4
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4]
+          {vex} vpdpbusds xmm6, xmm5, xmm4
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+          {vex} vpdpbusds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00]
+          {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rip]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00]
+          {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rcx + 4064]
+
+// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff]
+          {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rdx - 4096]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+          {vex} vpdpbusds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00]
+          {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rip]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*rbp - 512]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00]
+          {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rcx + 2032]
+
+// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff]
+          {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rdx - 2048]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4]
+          {vex} vpdpwssd ymm6, ymm5, ymm4
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4]
+          {vex} vpdpwssd xmm6, xmm5, xmm4
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
+          {vex} vpdpwssd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00]
+          {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rip]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00]
+          {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rcx + 4064]
+
+// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff]
+          {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rdx - 4096]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
+          {vex} vpdpwssd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00]
+          {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rip]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*rbp - 512]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00]
+          {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rcx + 2032]
+
+// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff]
+          {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rdx - 2048]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4]
+          {vex} vpdpwssds ymm6, ymm5, ymm4
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4]
+          {vex} vpdpwssds xmm6, xmm5, xmm4
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00]
+          {vex} vpdpwssds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00]
+          {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rip]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00]
+          {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rcx + 4064]
+
+// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff]
+          {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rdx - 4096]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00]
+          {vex} vpdpwssds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00]
+          {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rip]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*rbp - 512]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00]
+          {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rcx + 2032]
+
+// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff]
+          {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rdx - 2048]
+

diff  --git a/llvm/test/MC/X86/x86-64-avx_vnni-encoding.s b/llvm/test/MC/X86/x86-64-avx_vnni-encoding.s
new file mode 100644
index 000000000000..8fc7f113b002
--- /dev/null
+++ b/llvm/test/MC/X86/x86-64-avx_vnni-encoding.s
@@ -0,0 +1,226 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+avxvnni --show-encoding < %s  | FileCheck %s
+
+// CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4]
+          {vex} vpdpbusd %ymm4, %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4]
+          {vex} vpdpbusd %xmm4, %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd  268435456(%rbp,%r14,8), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          {vex} vpdpbusd  268435456(%rbp,%r14,8), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd  291(%r8,%rax,4), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+          {vex} vpdpbusd  291(%r8,%rax,4), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd  (%rip), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00]
+          {vex} vpdpbusd  (%rip), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd  -1024(,%rbp,2), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          {vex} vpdpbusd  -1024(,%rbp,2), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd  4064(%rcx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00]
+          {vex} vpdpbusd  4064(%rcx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd  -4096(%rdx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff]
+          {vex} vpdpbusd  -4096(%rdx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusd  268435456(%rbp,%r14,8), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          {vex} vpdpbusd  268435456(%rbp,%r14,8), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd  291(%r8,%rax,4), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
+          {vex} vpdpbusd  291(%r8,%rax,4), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd  (%rip), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00]
+          {vex} vpdpbusd  (%rip), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd  -512(,%rbp,2), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          {vex} vpdpbusd  -512(,%rbp,2), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd  2032(%rcx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00]
+          {vex} vpdpbusd  2032(%rcx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusd  -2048(%rdx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff]
+          {vex} vpdpbusd  -2048(%rdx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4]
+          {vex} vpdpbusds %ymm4, %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4]
+          {vex} vpdpbusds %xmm4, %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds  268435456(%rbp,%r14,8), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          {vex} vpdpbusds  268435456(%rbp,%r14,8), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds  291(%r8,%rax,4), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+          {vex} vpdpbusds  291(%r8,%rax,4), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds  (%rip), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00]
+          {vex} vpdpbusds  (%rip), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds  -1024(,%rbp,2), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          {vex} vpdpbusds  -1024(,%rbp,2), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds  4064(%rcx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00]
+          {vex} vpdpbusds  4064(%rcx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds  -4096(%rdx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff]
+          {vex} vpdpbusds  -4096(%rdx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpbusds  268435456(%rbp,%r14,8), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          {vex} vpdpbusds  268435456(%rbp,%r14,8), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds  291(%r8,%rax,4), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
+          {vex} vpdpbusds  291(%r8,%rax,4), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds  (%rip), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00]
+          {vex} vpdpbusds  (%rip), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds  -512(,%rbp,2), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          {vex} vpdpbusds  -512(,%rbp,2), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds  2032(%rcx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00]
+          {vex} vpdpbusds  2032(%rcx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpbusds  -2048(%rdx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff]
+          {vex} vpdpbusds  -2048(%rdx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4]
+          {vex} vpdpwssd %ymm4, %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4]
+          {vex} vpdpwssd %xmm4, %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd  268435456(%rbp,%r14,8), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          {vex} vpdpwssd  268435456(%rbp,%r14,8), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd  291(%r8,%rax,4), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
+          {vex} vpdpwssd  291(%r8,%rax,4), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd  (%rip), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00]
+          {vex} vpdpwssd  (%rip), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd  -1024(,%rbp,2), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          {vex} vpdpwssd  -1024(,%rbp,2), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd  4064(%rcx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00]
+          {vex} vpdpwssd  4064(%rcx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd  -4096(%rdx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff]
+          {vex} vpdpwssd  -4096(%rdx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssd  268435456(%rbp,%r14,8), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          {vex} vpdpwssd  268435456(%rbp,%r14,8), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd  291(%r8,%rax,4), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
+          {vex} vpdpwssd  291(%r8,%rax,4), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd  (%rip), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00]
+          {vex} vpdpwssd  (%rip), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd  -512(,%rbp,2), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          {vex} vpdpwssd  -512(,%rbp,2), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd  2032(%rcx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00]
+          {vex} vpdpwssd  2032(%rcx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssd  -2048(%rdx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff]
+          {vex} vpdpwssd  -2048(%rdx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds %ymm4, %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4]
+          {vex} vpdpwssds %ymm4, %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds %xmm4, %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4]
+          {vex} vpdpwssds %xmm4, %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds  268435456(%rbp,%r14,8), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          {vex} vpdpwssds  268435456(%rbp,%r14,8), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds  291(%r8,%rax,4), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00]
+          {vex} vpdpwssds  291(%r8,%rax,4), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds  (%rip), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00]
+          {vex} vpdpwssds  (%rip), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds  -1024(,%rbp,2), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff]
+          {vex} vpdpwssds  -1024(,%rbp,2), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds  4064(%rcx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00]
+          {vex} vpdpwssds  4064(%rcx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds  -4096(%rdx), %ymm5, %ymm6
+// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff]
+          {vex} vpdpwssds  -4096(%rdx), %ymm5, %ymm6
+
+// CHECK: {vex} vpdpwssds  268435456(%rbp,%r14,8), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          {vex} vpdpwssds  268435456(%rbp,%r14,8), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds  291(%r8,%rax,4), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00]
+          {vex} vpdpwssds  291(%r8,%rax,4), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds  (%rip), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00]
+          {vex} vpdpwssds  (%rip), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds  -512(,%rbp,2), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff]
+          {vex} vpdpwssds  -512(,%rbp,2), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds  2032(%rcx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00]
+          {vex} vpdpwssds  2032(%rcx), %xmm5, %xmm6
+
+// CHECK: {vex} vpdpwssds  -2048(%rdx), %xmm5, %xmm6
+// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff]
+          {vex} vpdpwssds  -2048(%rdx), %xmm5, %xmm6
+


        


More information about the cfe-commits mailing list