[llvm] 0e720e6 - [X86] Add AVX-IFMA instructions.
Freddy Ye via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 27 18:42:45 PDT 2022
Author: Freddy Ye
Date: 2022-10-28T09:42:30+08:00
New Revision: 0e720e6adad13d9a3d29dc41e5c62240047acf55
URL: https://github.com/llvm/llvm-project/commit/0e720e6adad13d9a3d29dc41e5c62240047acf55
DIFF: https://github.com/llvm/llvm-project/commit/0e720e6adad13d9a3d29dc41e5c62240047acf55.diff
LOG: [X86] Add AVX-IFMA instructions.
For more details about these instructions, please refer to the latest Intel Architecture Instruction Set Extensions (ISE) programming reference: https://www.intel.com/content/www/us/en/develop/download/intel-architecture-instruction-set-extensions-programming-reference.html
Reviewed By: pengfei, skan
Differential Revision: https://reviews.llvm.org/D135932
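The new intrinsics mirror the existing AVX512-IFMA _mm(256)_madd52{hi,lo}_epu64
forms but are VEX-encoded and unmasked, so they only require AVX2-class
hardware. A minimal usage sketch, not part of the patch (assumes a compiler
with this change and -mavxifma):

    #include <immintrin.h>

    // One multiply-accumulate step over four 52-bit limbs:
    // acc += low 52 bits of (a * b), per 64-bit lane.
    __m256i madd52lo_step(__m256i acc, __m256i a, __m256i b) {
      return _mm256_madd52lo_avx_epu64(acc, a, b);
    }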
Added:
clang/lib/Headers/avxifmaintrin.h
clang/test/CodeGen/avxifma-builtins.c
llvm/test/CodeGen/X86/avx-ifma-intrinsics.ll
llvm/test/CodeGen/X86/stack-folding-int-avx512ifma.ll
llvm/test/CodeGen/X86/stack-folding-int-avxifma.ll
llvm/test/MC/Disassembler/X86/avx-ifma-32.txt
llvm/test/MC/Disassembler/X86/avx-ifma-64.txt
llvm/test/MC/X86/avx-ifma-att-32.s
llvm/test/MC/X86/avx-ifma-att-64.s
llvm/test/MC/X86/avx-ifma-intel-32.s
llvm/test/MC/X86/avx-ifma-intel-64.s
Modified:
clang/docs/ReleaseNotes.rst
clang/include/clang/Basic/BuiltinsX86.def
clang/include/clang/Driver/Options.td
clang/lib/Basic/Targets/X86.cpp
clang/lib/Basic/Targets/X86.h
clang/lib/Headers/CMakeLists.txt
clang/lib/Headers/avx512ifmavlintrin.h
clang/lib/Headers/cpuid.h
clang/lib/Headers/immintrin.h
clang/test/CodeGen/attr-target-x86.c
clang/test/Driver/x86-target-features.c
clang/test/Preprocessor/x86_target_features.c
llvm/docs/ReleaseNotes.rst
llvm/include/llvm/Support/X86TargetParser.def
llvm/lib/Support/Host.cpp
llvm/lib/Support/X86TargetParser.cpp
llvm/lib/Target/X86/X86.td
llvm/lib/Target/X86/X86InstrFoldTables.cpp
llvm/lib/Target/X86/X86InstrInfo.cpp
llvm/lib/Target/X86/X86InstrInfo.td
llvm/lib/Target/X86/X86InstrSSE.td
Removed:
################################################################################
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index bc27c82b36216..d40475cd92b84 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -645,6 +645,9 @@ X86 Support in Clang
* Support intrinsic of ``_aand_i32/64``
* Support intrinsic of ``_aor_i32/64``
* Support intrinsic of ``_axor_i32/64``
+- Support ISA of ``AVX-IFMA``.
+ * Support intrinsic of ``_mm(256)_madd52hi_avx_epu64``.
+ * Support intrinsic of ``_mm(256)_madd52lo_avx_epu64``.
WebAssembly Support in Clang
----------------------------
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index 24f958f17e310..d7ed93885020f 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -1328,10 +1328,10 @@ TARGET_BUILTIN(__builtin_ia32_movdqa64store128_mask, "vV2Oi*V2OiUc", "nV:128:",
TARGET_BUILTIN(__builtin_ia32_movdqa64store256_mask, "vV4Oi*V4OiUc", "nV:256:", "avx512vl")
TARGET_BUILTIN(__builtin_ia32_vpmadd52huq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512ifma")
TARGET_BUILTIN(__builtin_ia32_vpmadd52luq512, "V8OiV8OiV8OiV8Oi", "ncV:512:", "avx512ifma")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52huq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512ifma,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52huq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512ifma,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52luq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512ifma,avx512vl")
-TARGET_BUILTIN(__builtin_ia32_vpmadd52luq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512ifma,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_vpmadd52huq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512ifma,avx512vl|avxifma")
+TARGET_BUILTIN(__builtin_ia32_vpmadd52huq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512ifma,avx512vl|avxifma")
+TARGET_BUILTIN(__builtin_ia32_vpmadd52luq128, "V2OiV2OiV2OiV2Oi", "ncV:128:", "avx512ifma,avx512vl|avxifma")
+TARGET_BUILTIN(__builtin_ia32_vpmadd52luq256, "V4OiV4OiV4OiV4Oi", "ncV:256:", "avx512ifma,avx512vl|avxifma")
TARGET_BUILTIN(__builtin_ia32_vcomisd, "iV2dV2dIiIi", "ncV:128:", "avx512f")
TARGET_BUILTIN(__builtin_ia32_vcomiss, "iV4fV4fIiIi", "ncV:128:", "avx512f")
TARGET_BUILTIN(__builtin_ia32_kunpckdi, "UOiUOiUOi", "nc", "avx512bw")
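The '|' alternative added to the feature strings above lets these shared
VPMADD52 builtins be used when either the AVX512IFMA+AVX512VL pair or the new
AVXIFMA feature is enabled. A hedged sketch of the two paths (the function
names are illustrative only):

    #include <immintrin.h>

    __attribute__((target("avxifma")))
    __m128i via_vex(__m128i x, __m128i y, __m128i z) {
      return _mm_madd52lo_avx_epu64(x, y, z); // VEX-encoded vpmadd52luq
    }

    __attribute__((target("avx512ifma,avx512vl")))
    __m128i via_evex(__m128i x, __m128i y, __m128i z) {
      return _mm_madd52lo_epu64(x, y, z);     // EVEX-capable vpmadd52luq
    }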
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index bc30ca9deb8a1..0e80afa73b2a4 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4588,6 +4588,8 @@ def mavx512vpopcntdq : Flag<["-"], "mavx512vpopcntdq">, Group<m_x86_Features_Gro
def mno_avx512vpopcntdq : Flag<["-"], "mno-avx512vpopcntdq">, Group<m_x86_Features_Group>;
def mavx512vp2intersect : Flag<["-"], "mavx512vp2intersect">, Group<m_x86_Features_Group>;
def mno_avx512vp2intersect : Flag<["-"], "mno-avx512vp2intersect">, Group<m_x86_Features_Group>;
+def mavxifma : Flag<["-"], "mavxifma">, Group<m_x86_Features_Group>;
+def mno_avxifma : Flag<["-"], "mno-avxifma">, Group<m_x86_Features_Group>;
def mavxvnni : Flag<["-"], "mavxvnni">, Group<m_x86_Features_Group>;
def mno_avxvnni : Flag<["-"], "mno-avxvnni">, Group<m_x86_Features_Group>;
def madx : Flag<["-"], "madx">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index f1e1facb56701..78c032e2d7d4b 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -338,6 +338,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasCMPCCXADD = true;
} else if (Feature == "+raoint") {
HasRAOINT = true;
+ } else if (Feature == "+avxifma") {
+ HasAVXIFMA = true;
} else if (Feature == "+avxvnni") {
HasAVXVNNI = true;
} else if (Feature == "+serialize") {
@@ -790,6 +792,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__CMPCCXADD__");
if (HasRAOINT)
Builder.defineMacro("__RAOINT__");
+ if (HasAVXIFMA)
+ Builder.defineMacro("__AVXIFMA__");
if (HasAVXVNNI)
Builder.defineMacro("__AVXVNNI__");
if (HasSERIALIZE)
@@ -914,6 +918,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("avx512vbmi2", true)
.Case("avx512ifma", true)
.Case("avx512vp2intersect", true)
+ .Case("avxifma", true)
.Case("avxvnni", true)
.Case("bmi", true)
.Case("bmi2", true)
@@ -994,7 +999,6 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("amx-fp16", HasAMXFP16)
.Case("amx-int8", HasAMXINT8)
.Case("amx-tile", HasAMXTILE)
- .Case("avxvnni", HasAVXVNNI)
.Case("avx", SSELevel >= AVX)
.Case("avx2", SSELevel >= AVX2)
.Case("avx512f", SSELevel >= AVX512F)
@@ -1013,6 +1017,8 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("avx512vbmi2", HasAVX512VBMI2)
.Case("avx512ifma", HasAVX512IFMA)
.Case("avx512vp2intersect", HasAVX512VP2INTERSECT)
+ .Case("avxifma", HasAVXIFMA)
+ .Case("avxvnni", HasAVXVNNI)
.Case("bmi", HasBMI)
.Case("bmi2", HasBMI2)
.Case("cldemote", HasCLDEMOTE)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index f48ab1d5c19a1..7b67a4060ec3b 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -104,6 +104,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasAVX512VL = false;
bool HasAVX512VBMI = false;
bool HasAVX512VBMI2 = false;
+ bool HasAVXIFMA = false;
bool HasAVX512IFMA = false;
bool HasAVX512VP2INTERSECT = false;
bool HasSHA = false;
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 23355068622f3..5a7f81b4ed07d 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -142,6 +142,7 @@ set(x86_files
avx512vp2intersectintrin.h
avx512vpopcntdqintrin.h
avx512vpopcntdqvlintrin.h
+ avxifmaintrin.h
avxintrin.h
avxvnniintrin.h
bmi2intrin.h
diff --git a/clang/lib/Headers/avx512ifmavlintrin.h b/clang/lib/Headers/avx512ifmavlintrin.h
index 5889401d10553..3284ee182004b 100644
--- a/clang/lib/Headers/avx512ifmavlintrin.h
+++ b/clang/lib/Headers/avx512ifmavlintrin.h
@@ -18,14 +18,21 @@
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(256)))
+#define _mm_madd52hi_epu64(X, Y, Z) \
+ ((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y), \
+ (__v2di)(Z)))
+#define _mm256_madd52hi_epu64(X, Y, Z) \
+ ((__m256i)__builtin_ia32_vpmadd52huq256((__v4di)(X), (__v4di)(Y), \
+ (__v4di)(Z)))
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
-{
- return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di) __X, (__v2di) __Y,
- (__v2di) __Z);
-}
+#define _mm_madd52lo_epu64(X, Y, Z) \
+ ((__m128i)__builtin_ia32_vpmadd52luq128((__v2di)(X), (__v2di)(Y), \
+ (__v2di)(Z)))
+
+#define _mm256_madd52lo_epu64(X, Y, Z) \
+ ((__m256i)__builtin_ia32_vpmadd52luq256((__v4di)(X), (__v4di)(Y), \
+ (__v4di)(Z)))
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
@@ -43,13 +50,6 @@ _mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
(__v2di)_mm_setzero_si128());
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
-{
- return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
- (__v4di)__Z);
-}
-
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
@@ -66,13 +66,6 @@ _mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z
(__v4di)_mm256_setzero_si256());
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
-{
- return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
- (__v2di)__Z);
-}
-
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
@@ -89,13 +82,6 @@ _mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
(__v2di)_mm_setzero_si128());
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
-{
- return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
- (__v4di)__Z);
-}
-
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
diff --git a/clang/lib/Headers/avxifmaintrin.h b/clang/lib/Headers/avxifmaintrin.h
new file mode 100644
index 0000000000000..5c782d2a5b865
--- /dev/null
+++ b/clang/lib/Headers/avxifmaintrin.h
@@ -0,0 +1,177 @@
+/*===----------------- avxifmaintrin.h - IFMA intrinsics -------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avxifmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXIFMAINTRIN_H
+#define __AVXIFMAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
+ __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
+ __min_vector_width__(256)))
+
+// These intrinsics must use VEX encoding.
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
+/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the corresponding
+/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i
+/// _mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
+///
+/// \return
+///  The 128-bit result vector \a dst.
+/// \param __X
+/// A 128-bit vector of [2 x i64]
+/// \param __Y
+/// A 128-bit vector of [2 x i64]
+/// \param __Z
+/// A 128-bit vector of [2 x i64]
+///
+/// \code{.operation}
+/// FOR j := 0 to 1
+/// i := j*64
+/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
+/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+ return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y,
+ (__v2di)__Z);
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
+/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the corresponding
+/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i
+/// _mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
+///
+/// \return
+///  The 256-bit result vector \a dst.
+/// \param __X
+/// A 256-bit vector of [4 x i64]
+/// \param __Y
+/// A 256-bit vector of [4 x i64]
+/// \param __Z
+/// A 256-bit vector of [4 x i64]
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// i := j*64
+/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
+/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
+ return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
+ (__v4di)__Z);
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
+/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the corresponding
+/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i
+/// _mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
+///
+/// \return
+///  The 128-bit result vector \a dst.
+/// \param __X
+/// A 128-bit vector of [2 x i64]
+/// \param __Y
+/// A 128-bit vector of [2 x i64]
+/// \param __Z
+/// A 128-bit vector of [2 x i64]
+///
+/// \code{.operation}
+/// FOR j := 0 to 1
+/// i := j*64
+/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
+/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+ return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
+ (__v2di)__Z);
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
+/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the corresponding
+/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i
+/// _mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
+///
+/// \return
+///  The 256-bit result vector \a dst.
+/// \param __X
+/// A 256-bit vector of [4 x i64]
+/// \param __Y
+/// A 256-bit vector of [4 x i64]
+/// \param __Z
+/// A 256-bit vector of [4 x i64]
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// i := j*64
+/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
+/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
+ return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
+ (__v4di)__Z);
+}
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif // __AVXIFMAINTRIN_H
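For reference, a self-contained example exercising the new header (an
illustrative sketch, not from the patch; it assumes an AVX-IFMA-capable CPU
and compilation with -mavxifma):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
      // 52-bit operands; each 64-bit lane forms a 104-bit product.
      __m128i a = _mm_set1_epi64x((1LL << 51) + 3);
      __m128i b = _mm_set1_epi64x((1LL << 50) + 7);
      __m128i z = _mm_setzero_si128();
      __m128i lo = _mm_madd52lo_avx_epu64(z, a, b); // z + product[51:0]
      __m128i hi = _mm_madd52hi_avx_epu64(z, a, b); // z + product[103:52]
      long long l[2], h[2];
      _mm_storeu_si128((__m128i *)l, lo);
      _mm_storeu_si128((__m128i *)h, hi);
      printf("lo52=%#llx hi52=%#llx\n", l[0], h[0]);
      return 0;
    }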
diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h
index 4af26ecd8fc08..2e674ef6d96c3 100644
--- a/clang/lib/Headers/cpuid.h
+++ b/clang/lib/Headers/cpuid.h
@@ -206,6 +206,7 @@
#define bit_CMPCCXADD 0x00000080
#define bit_AMXFP16 0x00200000
#define bit_HRESET 0x00400000
+#define bit_AVXIFMA 0x00800000
/* Features in %edx for leaf 7 sub-leaf 1 */
#define bit_PREFETCHI 0x00004000
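AVX-IFMA is enumerated in CPUID.(EAX=07H,ECX=1):EAX[23], matching bit_AVXIFMA
above. A runtime-detection sketch using this header (illustrative; production
code should also confirm OS support for saving AVX state via XGETBV):

    #include <cpuid.h>

    static int has_avxifma(void) {
      unsigned int eax, ebx, ecx, edx;
      if (!__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx))
        return 0;
      return (eax & bit_AVXIFMA) != 0;
    }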
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index abae5410f2eaf..00ee91b364aeb 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -189,6 +189,11 @@
#include <avx512ifmavlintrin.h>
#endif
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVXIFMA__)
+#include <avxifmaintrin.h>
+#endif
+
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__AVX512VBMI__)
#include <avx512vbmiintrin.h>
diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c
index 653033a649c81..604e3152debad 100644
--- a/clang/test/CodeGen/attr-target-x86.c
+++ b/clang/test/CodeGen/attr-target-x86.c
@@ -54,9 +54,9 @@ void __attribute__((target("arch=x86-64-v4"))) x86_64_v4(void) {}
// CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87" "tune-cpu"="i686"
// CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
// CHECK-NOT: tune-cpu
-// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
+// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
// CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+crc32,+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686"
-// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
+// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxvnni,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
// CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes"
// CHECK-NOT: tune-cpu
// CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-3dnow,-3dnowa,-mmx"
diff --git a/clang/test/CodeGen/avxifma-builtins.c b/clang/test/CodeGen/avxifma-builtins.c
new file mode 100644
index 0000000000000..56e434cd7b164
--- /dev/null
+++ b/clang/test/CodeGen/avxifma-builtins.c
@@ -0,0 +1,52 @@
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxifma -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=i386-apple-darwin -target-feature +avxifma -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+#include <immintrin.h>
+
+__m128i test_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+// CHECK-LABEL: @test_mm_madd52hi_epu64
+// CHECK: call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128
+ return _mm_madd52hi_epu64(__X, __Y, __Z);
+}
+
+__m256i test_mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
+// CHECK-LABEL: @test_mm256_madd52hi_epu64
+// CHECK: call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256
+ return _mm256_madd52hi_epu64(__X, __Y, __Z);
+}
+
+__m128i test_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+// CHECK-LABEL: @test_mm_madd52lo_epu64
+// CHECK: call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128
+ return _mm_madd52lo_epu64(__X, __Y, __Z);
+}
+
+__m256i test_mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
+// CHECK-LABEL: @test_mm256_madd52lo_epu64
+// CHECK: call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256
+ return _mm256_madd52lo_epu64(__X, __Y, __Z);
+}
+
+__m128i test_mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+// CHECK-LABEL: @test_mm_madd52hi_avx_epu64
+// CHECK: call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128
+ return _mm_madd52hi_avx_epu64(__X, __Y, __Z);
+}
+
+__m256i test_mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
+// CHECK-LABEL: @test_mm256_madd52hi_avx_epu64
+// CHECK: call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256
+ return _mm256_madd52hi_avx_epu64(__X, __Y, __Z);
+}
+
+__m128i test_mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+// CHECK-LABEL: @test_mm_madd52lo_avx_epu64
+// CHECK: call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128
+ return _mm_madd52lo_avx_epu64(__X, __Y, __Z);
+}
+
+__m256i test_mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
+// CHECK-LABEL: @test_mm256_madd52lo_avx_epu64
+// CHECK: call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256
+ return _mm256_madd52lo_avx_epu64(__X, __Y, __Z);
+}
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index 5440b143ea3e0..0954ad135b328 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -327,6 +327,11 @@
// RAOINT: "-target-feature" "+raoint"
// NO-RAOINT: "-target-feature" "-raoint"
+// RUN: %clang -target i386-linux-gnu -mavxifma %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVXIFMA %s
+// RUN: %clang -target i386-linux-gnu -mno-avxifma %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AVXIFMA %s
+// AVXIFMA: "-target-feature" "+avxifma"
+// NO-AVXIFMA: "-target-feature" "-avxifma"
+
// RUN: %clang --target=i386 -march=i386 -mcrc32 %s -### 2>&1 | FileCheck -check-prefix=CRC32 %s
// RUN: %clang --target=i386 -march=i386 -mno-crc32 %s -### 2>&1 | FileCheck -check-prefix=NO-CRC32 %s
// CRC32: "-target-feature" "+crc32"
diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
index 5f2e244125a2a..2db998f4ee822 100644
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -602,6 +602,15 @@
// RUN: %clang -target x86_64-unknown-linux-gnu -march=atom -mno-cmpccxadd -x c -E -dM -o - %s | FileCheck -check-prefix=NO-CMPCCXADD %s
// NO-CMPCCXADD-NOT: #define __CMPCCXADD__ 1
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavxifma -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXIFMA %s
+
+// AVXIFMA: #define __AVX2__ 1
+// AVXIFMA: #define __AVXIFMA__ 1
+
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavxifma -mno-avx2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXIFMANOAVX2 %s
+
+// AVXIFMANOAVX2-NOT: #define __AVX2__ 1
+// AVXIFMANOAVX2-NOT: #define __AVXIFMA__ 1
// RUN: %clang -target i386-unknown-linux-gnu -march=atom -mraoint -x c -E -dM -o - %s | FileCheck -check-prefix=RAOINT %s
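Because -mavxifma defines __AVXIFMA__ (and implies __AVX2__), as the tests
above check, source can guard AVX-IFMA paths on the macro. A small sketch; the
#else branch is a hypothetical scalar emulation, not from the patch:

    #include <immintrin.h>

    static inline __m128i madd52lo_128(__m128i acc, __m128i a, __m128i b) {
    #ifdef __AVXIFMA__
      return _mm_madd52lo_avx_epu64(acc, a, b);
    #else
      // Scalar fallback: acc += low 52 bits of (a[51:0] * b[51:0]) per lane.
      long long va[2], vb[2], vc[2];
      _mm_storeu_si128((__m128i *)va, a);
      _mm_storeu_si128((__m128i *)vb, b);
      _mm_storeu_si128((__m128i *)vc, acc);
      for (int i = 0; i < 2; ++i) {
        unsigned __int128 t =
            (unsigned __int128)(va[i] & ((1LL << 52) - 1)) *
            (unsigned long long)(vb[i] & ((1LL << 52) - 1));
        vc[i] += (long long)(t & ((1ULL << 52) - 1));
      }
      return _mm_loadu_si128((const __m128i *)vc);
    #endif
    }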
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 3918899d72229..d892d92297e4b 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -135,6 +135,7 @@ Changes to the Windows Target
Changes to the X86 Backend
--------------------------
+* Add support for the ``AVX-IFMA`` instructions.
* Add support for the ``RDMSRLIST and WRMSRLIST`` instructions.
* Add support for the ``WRMSRNS`` instruction.
diff --git a/llvm/include/llvm/Support/X86TargetParser.def b/llvm/include/llvm/Support/X86TargetParser.def
index f03666fbfa047..e8e50acf1771b 100644
--- a/llvm/include/llvm/Support/X86TargetParser.def
+++ b/llvm/include/llvm/Support/X86TargetParser.def
@@ -206,6 +206,7 @@ X86_FEATURE (AVX512FP16, "avx512fp16")
X86_FEATURE (AMX_FP16, "amx-fp16")
X86_FEATURE (CMPCCXADD, "cmpccxadd")
X86_FEATURE (AVXVNNI, "avxvnni")
+X86_FEATURE (AVXIFMA, "avxifma")
// These features aren't really CPU features, but the frontend can set them.
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk")
X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches")
diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp
index fac2b017ed89c..96663c8d40e43 100644
--- a/llvm/lib/Support/Host.cpp
+++ b/llvm/lib/Support/Host.cpp
@@ -1811,6 +1811,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["amx-fp16"] = HasLeaf7Subleaf1 && ((EAX >> 21) & 1) && HasAMXSave;
Features["cmpccxadd"] = HasLeaf7Subleaf1 && ((EAX >> 7) & 1);
Features["hreset"] = HasLeaf7Subleaf1 && ((EAX >> 22) & 1);
+ Features["avxifma"] = HasLeaf7Subleaf1 && ((EAX >> 23) & 1) && HasAVXSave;
Features["prefetchi"] = HasLeaf7Subleaf1 && ((EDX >> 14) & 1);
bool HasLeafD = MaxLevel >= 0xd &&
diff --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp
index 96aee2159e5bc..2b0a3bb881201 100644
--- a/llvm/lib/Support/X86TargetParser.cpp
+++ b/llvm/lib/Support/X86TargetParser.cpp
@@ -585,6 +585,7 @@ constexpr FeatureBitset ImpliedFeaturesHRESET = {};
constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {};
constexpr FeatureBitset ImpliedFeaturesCMPCCXADD = {};
constexpr FeatureBitset ImpliedFeaturesRAOINT = {};
+constexpr FeatureBitset ImpliedFeaturesAVXIFMA = FeatureAVX2;
constexpr FeatureBitset ImpliedFeaturesAVX512FP16 =
FeatureAVX512BW | FeatureAVX512DQ | FeatureAVX512VL;
// Key Locker Features
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 078a6c24251ac..0f0cf69ed012e 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -155,6 +155,9 @@ def FeatureVBMI : SubtargetFeature<"avx512vbmi", "HasVBMI", "true",
def FeatureVBMI2 : SubtargetFeature<"avx512vbmi2", "HasVBMI2", "true",
"Enable AVX-512 further Vector Byte Manipulation Instructions",
[FeatureBWI]>;
+def FeatureAVXIFMA : SubtargetFeature<"avxifma", "HasAVXIFMA", "true",
+ "Enable AVX-IFMA",
+ [FeatureAVX2]>;
def FeatureIFMA : SubtargetFeature<"avx512ifma", "HasIFMA", "true",
"Enable AVX-512 Integer Fused Multiple-Add",
[FeatureAVX512]>;
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index 8aeb169929f2d..9d58889814037 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -4103,12 +4103,16 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VPLZCNTQZ128rrk, X86::VPLZCNTQZ128rmk, 0 },
{ X86::VPLZCNTQZ256rrk, X86::VPLZCNTQZ256rmk, 0 },
{ X86::VPLZCNTQZrrk, X86::VPLZCNTQZrmk, 0 },
+ { X86::VPMADD52HUQYrr, X86::VPMADD52HUQYrm, 0 },
{ X86::VPMADD52HUQZ128r, X86::VPMADD52HUQZ128m, 0 },
{ X86::VPMADD52HUQZ256r, X86::VPMADD52HUQZ256m, 0 },
{ X86::VPMADD52HUQZr, X86::VPMADD52HUQZm, 0 },
+ { X86::VPMADD52HUQrr, X86::VPMADD52HUQrm, 0 },
+ { X86::VPMADD52LUQYrr, X86::VPMADD52LUQYrm, 0 },
{ X86::VPMADD52LUQZ128r, X86::VPMADD52LUQZ128m, 0 },
{ X86::VPMADD52LUQZ256r, X86::VPMADD52LUQZ256m, 0 },
{ X86::VPMADD52LUQZr, X86::VPMADD52LUQZm, 0 },
+ { X86::VPMADD52LUQrr, X86::VPMADD52LUQrm, 0 },
{ X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 },
{ X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 },
{ X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 },
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index e57d0ffbc9409..4f17635b42df1 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -2573,6 +2573,8 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
case X86::VPDPWSSDSZr:
case X86::VPDPWSSDSZrk:
case X86::VPDPWSSDSZrkz:
+ case X86::VPMADD52HUQrr:
+ case X86::VPMADD52HUQYrr:
case X86::VPMADD52HUQZ128r:
case X86::VPMADD52HUQZ128rk:
case X86::VPMADD52HUQZ128rkz:
@@ -2582,6 +2584,8 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
case X86::VPMADD52HUQZr:
case X86::VPMADD52HUQZrk:
case X86::VPMADD52HUQZrkz:
+ case X86::VPMADD52LUQrr:
+ case X86::VPMADD52LUQYrr:
case X86::VPMADD52LUQZ128r:
case X86::VPMADD52LUQZ128rk:
case X86::VPMADD52LUQZ128rkz:
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index a9eca99919168..8e038f4c00a30 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -957,6 +957,8 @@ def NoBMI2 : Predicate<"!Subtarget->hasBMI2()">;
def HasVBMI : Predicate<"Subtarget->hasVBMI()">;
def HasVBMI2 : Predicate<"Subtarget->hasVBMI2()">;
def HasIFMA : Predicate<"Subtarget->hasIFMA()">;
+def HasAVXIFMA : Predicate<"Subtarget->hasAVXIFMA()">;
+def NoVLX_Or_NoIFMA : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasIFMA()">;
def HasRTM : Predicate<"Subtarget->hasRTM()">;
def HasADX : Predicate<"Subtarget->hasADX()">;
def HasSHA : Predicate<"Subtarget->hasSHA()">;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index c792270280e2b..a34eeb60f7ed5 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -8123,3 +8123,40 @@ let isCommutable = 0 in {
X86GF2P8affineqb>, TAPD;
}
+let Predicates = [HasAVXIFMA, NoVLX_Or_NoIFMA], Constraints = "$src1 = $dst",
+ checkVEXPredicate = 1 in
+multiclass avx_ifma_rm<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+  // NOTE: The SDNode has the multiply operands first, with the add last.
+ // This enables commuted load patterns to be autogenerated by tablegen.
+ let isCommutable = 1 in {
+ def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
+ VR128:$src3, VR128:$src1)))]>,
+ VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+ }
+ def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
+ (loadv2i64 addr:$src3), VR128:$src1)))]>,
+ VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+ let isCommutable = 1 in {
+ def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
+ VR256:$src3, VR256:$src1)))]>,
+ VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+ }
+ def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, i256mem:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
+ (loadv4i64 addr:$src3), VR256:$src1)))]>,
+ VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+}
+
+defm VPMADD52HUQ : avx_ifma_rm<0xb5, "vpmadd52huq", x86vpmadd52h>, VEX_W, ExplicitVEXPrefix;
+defm VPMADD52LUQ : avx_ifma_rm<0xb4, "vpmadd52luq", x86vpmadd52l>, VEX_W, ExplicitVEXPrefix;
diff --git a/llvm/test/CodeGen/X86/avx-ifma-intrinsics.ll b/llvm/test/CodeGen/X86/avx-ifma-intrinsics.ll
new file mode 100644
index 0000000000000..f2f1d9d910356
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx-ifma-intrinsics.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxifma --show-mc-encoding | FileCheck %s --check-prefix=AVXIFMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxifma --show-mc-encoding | FileCheck %s --check-prefix=AVXIFMA
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxifma,+avx512ifma,+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=AVX512IFMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxifma,+avx512ifma,+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=AVX512IFMA
+
+declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
+
+define <2 x i64>@test_int_x86_avx_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; AVXIFMA-LABEL: test_int_x86_avx_vpmadd52h_uq_128:
+; AVXIFMA: # %bb.0:
+; AVXIFMA-NEXT: {vex} vpmadd52huq %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xb5,0xc2]
+; AVXIFMA-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512IFMA-LABEL: test_int_x86_avx_vpmadd52h_uq_128:
+; AVX512IFMA: # %bb.0:
+; AVX512IFMA-NEXT: {vex} vpmadd52huq %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xb5,0xc2]
+; AVX512IFMA-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ ret <2 x i64> %res
+}
+
+declare <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
+
+define <4 x i64>@test_int_x86_avx_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) {
+; AVXIFMA-LABEL: test_int_x86_avx_vpmadd52h_uq_256:
+; AVXIFMA: # %bb.0:
+; AVXIFMA-NEXT: {vex} vpmadd52huq %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xb5,0xc2]
+; AVXIFMA-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512IFMA-LABEL: test_int_x86_avx_vpmadd52h_uq_256:
+; AVX512IFMA: # %bb.0:
+; AVX512IFMA-NEXT: {vex} vpmadd52huq %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xb5,0xc2]
+; AVX512IFMA-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ ret <4 x i64> %res
+}
+
+declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
+
+define <2 x i64>@test_int_x86_avx_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; AVXIFMA-LABEL: test_int_x86_avx_vpmadd52l_uq_128:
+; AVXIFMA: # %bb.0:
+; AVXIFMA-NEXT: {vex} vpmadd52luq %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xb4,0xc2]
+; AVXIFMA-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512IFMA-LABEL: test_int_x86_avx_vpmadd52l_uq_128:
+; AVX512IFMA: # %bb.0:
+; AVX512IFMA-NEXT: {vex} vpmadd52luq %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xb4,0xc2]
+; AVX512IFMA-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ ret <2 x i64> %res
+}
+
+declare <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
+
+define <4 x i64>@test_int_x86_avx_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) {
+; AVXIFMA-LABEL: test_int_x86_avx_vpmadd52l_uq_256:
+; AVXIFMA: # %bb.0:
+; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xb4,0xc2]
+; AVXIFMA-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512IFMA-LABEL: test_int_x86_avx_vpmadd52l_uq_256:
+; AVX512IFMA: # %bb.0:
+; AVX512IFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xb4,0xc2]
+; AVX512IFMA-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ ret <4 x i64> %res
+}
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512ifma.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512ifma.ll
new file mode 100644
index 0000000000000..a1c8c3534e0cd
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512ifma.ll
@@ -0,0 +1,217 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512ifma,+avx512vl < %s | FileCheck %s
+
+declare <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
+declare <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
+
+define <8 x i64> @stack_fold_vpmadd52huq(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; CHECK-LABEL: stack_fold_vpmadd52huq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2)
+ ret <8 x i64> %2
+}
+
+define <8 x i64> @stack_fold_vpmadd52huq_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; CHECK-LABEL: stack_fold_vpmadd52huq_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %a0, <8 x i64> %a2, <8 x i64> %a1)
+ ret <8 x i64> %2
+}
+
+define <8 x i64> @stack_fold_vpmadd52huq_mask(ptr %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
+; CHECK-LABEL: stack_fold_vpmadd52huq_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x i64>, ptr %a0
+ %3 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %2, <8 x i64> %a1, <8 x i64> %a2)
+ %4 = bitcast i8 %mask to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %2
+ ret <8 x i64> %5
+}
+
+define <8 x i64> @stack_fold_vpmadd52huq_mask_commuted(ptr %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
+; CHECK-LABEL: stack_fold_vpmadd52huq_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x i64>, ptr %a0
+ %3 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %2, <8 x i64> %a2, <8 x i64> %a1)
+ %4 = bitcast i8 %mask to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %2
+ ret <8 x i64> %5
+}
+
+define <8 x i64> @stack_fold_vpmadd52huq_maskz(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, ptr %mask) {
+; CHECK-LABEL: stack_fold_vpmadd52huq_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movzbl (%rdi), %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2)
+ %3 = load i8, ptr %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer
+ ret <8 x i64> %5
+}
+
+define <8 x i64> @stack_fold_vpmadd52huq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, ptr %mask) {
+; CHECK-LABEL: stack_fold_vpmadd52huq_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movzbl (%rdi), %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %a0, <8 x i64> %a2, <8 x i64> %a1)
+ %3 = load i8, ptr %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer
+ ret <8 x i64> %5
+}
+
+define <8 x i64> @stack_fold_vpmadd52luq(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; CHECK-LABEL: stack_fold_vpmadd52luq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2)
+ ret <8 x i64> %2
+}
+
+define <8 x i64> @stack_fold_vpmadd52luq_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; CHECK-LABEL: stack_fold_vpmadd52luq_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %a0, <8 x i64> %a2, <8 x i64> %a1)
+ ret <8 x i64> %2
+}
+
+define <8 x i64> @stack_fold_vpmadd52luq_mask(ptr %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
+; CHECK-LABEL: stack_fold_vpmadd52luq_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x i64>, ptr %a0
+ %3 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %2, <8 x i64> %a1, <8 x i64> %a2)
+ %4 = bitcast i8 %mask to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %2
+ ret <8 x i64> %5
+}
+
+define <8 x i64> @stack_fold_vpmadd52luq_mask_commuted(ptr %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
+; CHECK-LABEL: stack_fold_vpmadd52luq_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <8 x i64>, ptr %a0
+ %3 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %2, <8 x i64> %a2, <8 x i64> %a1)
+ %4 = bitcast i8 %mask to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %2
+ ret <8 x i64> %5
+}
+
+define <8 x i64> @stack_fold_vpmadd52luq_maskz(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, ptr %mask) {
+; CHECK-LABEL: stack_fold_vpmadd52luq_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movzbl (%rdi), %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2)
+ %3 = load i8, ptr %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer
+ ret <8 x i64> %5
+}
+
+define <8 x i64> @stack_fold_vpmadd52luq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, ptr %mask) {
+; CHECK-LABEL: stack_fold_vpmadd52luq_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movzbl (%rdi), %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %a0, <8 x i64> %a2, <8 x i64> %a1)
+ %3 = load i8, ptr %mask
+ %4 = bitcast i8 %3 to <8 x i1>
+ %5 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer
+ ret <8 x i64> %5
+}
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxifma.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxifma.ll
new file mode 100644
index 0000000000000..95d21a4d981cc
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avxifma.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avxifma < %s | FileCheck %s
+
+declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
+declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
+
+define <2 x i64> @stack_fold_vpmadd52huq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
+; CHECK-LABEL: stack_fold_vpmadd52huq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2)
+ ret <2 x i64> %2
+}
+
+define <2 x i64> @stack_fold_vpmadd52huq_commuted(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
+; CHECK-LABEL: stack_fold_vpmadd52huq_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %a0, <2 x i64> %a2, <2 x i64> %a1)
+ ret <2 x i64> %2
+}
+
+define <4 x i64> @stack_fold_vpmadd52huq_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
+; CHECK-LABEL: stack_fold_vpmadd52huq_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2)
+ ret <4 x i64> %2
+}
+
+define <4 x i64> @stack_fold_vpmadd52huq_256_commuted(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
+; CHECK-LABEL: stack_fold_vpmadd52huq_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %a0, <4 x i64> %a2, <4 x i64> %a1)
+ ret <4 x i64> %2
+}
+
+define <2 x i64> @stack_fold_vpmadd52luq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
+; CHECK-LABEL: stack_fold_vpmadd52luq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2)
+ ret <2 x i64> %2
+}
+
+define <2 x i64> @stack_fold_vpmadd52luq_commuted(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
+; CHECK-LABEL: stack_fold_vpmadd52luq_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %a0, <2 x i64> %a2, <2 x i64> %a1)
+ ret <2 x i64> %2
+}
+
+define <4 x i64> @stack_fold_vpmadd52luq_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
+; CHECK-LABEL: stack_fold_vpmadd52luq_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2)
+ ret <4 x i64> %2
+}
+
+define <4 x i64> @stack_fold_vpmadd52luq_256_commuted(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
+; CHECK-LABEL: stack_fold_vpmadd52luq_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: {vex} vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %a0, <4 x i64> %a2, <4 x i64> %a1)
+ ret <4 x i64> %2
+}
diff --git a/llvm/test/MC/Disassembler/X86/avx-ifma-32.txt b/llvm/test/MC/Disassembler/X86/avx-ifma-32.txt
new file mode 100644
index 0000000000000..710c66f222c78
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/avx-ifma-32.txt
@@ -0,0 +1,115 @@
+# RUN: llvm-mc --disassemble %s -triple=i686 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=i686 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: {vex} vpmadd52huq %ymm4, %ymm3, %ymm2
+# INTEL: {vex} vpmadd52huq ymm2, ymm3, ymm4
+0xc4,0xe2,0xe5,0xb5,0xd4
+
+# ATT: {vex} vpmadd52huq %xmm4, %xmm3, %xmm2
+# INTEL: {vex} vpmadd52huq xmm2, xmm3, xmm4
+0xc4,0xe2,0xe1,0xb5,0xd4
+
+# ATT: {vex} vpmadd52huq 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0xe5,0xb5,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: {vex} vpmadd52huq 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0xe5,0xb5,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: {vex} vpmadd52huq (%eax), %ymm3, %ymm2
+# INTEL: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0xe5,0xb5,0x10
+
+# ATT: {vex} vpmadd52huq -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0xe5,0xb5,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: {vex} vpmadd52huq 4064(%ecx), %ymm3, %ymm2
+# INTEL: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0xe5,0xb5,0x91,0xe0,0x0f,0x00,0x00
+
+# ATT: {vex} vpmadd52huq -4096(%edx), %ymm3, %ymm2
+# INTEL: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [edx - 4096]
+0xc4,0xe2,0xe5,0xb5,0x92,0x00,0xf0,0xff,0xff
+
+# ATT: {vex} vpmadd52huq 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0xe1,0xb5,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: {vex} vpmadd52huq 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0xe1,0xb5,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: {vex} vpmadd52huq (%eax), %xmm3, %xmm2
+# INTEL: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0xe1,0xb5,0x10
+
+# ATT: {vex} vpmadd52huq -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0xe1,0xb5,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: {vex} vpmadd52huq 2032(%ecx), %xmm3, %xmm2
+# INTEL: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0xe1,0xb5,0x91,0xf0,0x07,0x00,0x00
+
+# ATT: {vex} vpmadd52huq -2048(%edx), %xmm3, %xmm2
+# INTEL: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [edx - 2048]
+0xc4,0xe2,0xe1,0xb5,0x92,0x00,0xf8,0xff,0xff
+
+# ATT: {vex} vpmadd52luq %ymm4, %ymm3, %ymm2
+# INTEL: {vex} vpmadd52luq ymm2, ymm3, ymm4
+0xc4,0xe2,0xe5,0xb4,0xd4
+
+# ATT: {vex} vpmadd52luq %xmm4, %xmm3, %xmm2
+# INTEL: {vex} vpmadd52luq xmm2, xmm3, xmm4
+0xc4,0xe2,0xe1,0xb4,0xd4
+
+# ATT: {vex} vpmadd52luq 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0xe5,0xb4,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: {vex} vpmadd52luq 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0xe5,0xb4,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: {vex} vpmadd52luq (%eax), %ymm3, %ymm2
+# INTEL: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0xe5,0xb4,0x10
+
+# ATT: {vex} vpmadd52luq -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0xe5,0xb4,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: {vex} vpmadd52luq 4064(%ecx), %ymm3, %ymm2
+# INTEL: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [ecx + 4064]
+0xc4,0xe2,0xe5,0xb4,0x91,0xe0,0x0f,0x00,0x00
+
+# ATT: {vex} vpmadd52luq -4096(%edx), %ymm3, %ymm2
+# INTEL: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [edx - 4096]
+0xc4,0xe2,0xe5,0xb4,0x92,0x00,0xf0,0xff,0xff
+
+# ATT: {vex} vpmadd52luq 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0xe1,0xb4,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: {vex} vpmadd52luq 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0xe1,0xb4,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: {vex} vpmadd52luq (%eax), %xmm3, %xmm2
+# INTEL: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0xe1,0xb4,0x10
+
+# ATT: {vex} vpmadd52luq -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0xe1,0xb4,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: {vex} vpmadd52luq 2032(%ecx), %xmm3, %xmm2
+# INTEL: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [ecx + 2032]
+0xc4,0xe2,0xe1,0xb4,0x91,0xf0,0x07,0x00,0x00
+
+# ATT: {vex} vpmadd52luq -2048(%edx), %xmm3, %xmm2
+# INTEL: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [edx - 2048]
+0xc4,0xe2,0xe1,0xb4,0x92,0x00,0xf8,0xff,0xff
+
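
As a reading aid for the byte strings in this file: each line is a three-byte
VEX prefix (0xC4 plus two payload bytes), opcode 0xB5 (huq) or 0xB4 (luq), a
ModRM byte, and any displacement. Below is a minimal field-extraction sketch,
assuming the standard three-byte VEX layout from the Intel SDM; the program is
illustrative and not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    /* Split the payload bytes of the first encoding above,
       0xc4,0xe2,0xe5,0xb5,0xd4 ({vex} vpmadd52huq %ymm4, %ymm3, %ymm2):
       expect map 2 (0F38), W=1, vvvv=3 (ymm3), L=1 (256-bit), pp=1 (66). */
    int main(void) {
      uint8_t b1 = 0xe2, b2 = 0xe5;
      printf("R=%u X=%u B=%u map=%u\n",    /* R/X/B are stored inverted */
             !((b1 >> 7) & 1), !((b1 >> 6) & 1), !((b1 >> 5) & 1),
             b1 & 0x1f);
      printf("W=%u vvvv=%u L=%u pp=%u\n",
             (b2 >> 7) & 1, (~(b2 >> 3)) & 0xf, (b2 >> 2) & 1, b2 & 3);
      return 0;
    }

The trailing ModRM byte 0xd4 (mod=3, reg=2, rm=4) then selects ymm2 as the
destination and ymm4 as the register source.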
diff --git a/llvm/test/MC/Disassembler/X86/avx-ifma-64.txt b/llvm/test/MC/Disassembler/X86/avx-ifma-64.txt
new file mode 100644
index 0000000000000..f862a806cd077
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/avx-ifma-64.txt
@@ -0,0 +1,115 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: {vex} vpmadd52huq %ymm14, %ymm13, %ymm12
+# INTEL: {vex} vpmadd52huq ymm12, ymm13, ymm14
+0xc4,0x42,0x95,0xb5,0xe6
+
+# ATT: {vex} vpmadd52huq %xmm14, %xmm13, %xmm12
+# INTEL: {vex} vpmadd52huq xmm12, xmm13, xmm14
+0xc4,0x42,0x91,0xb5,0xe6
+
+# ATT: {vex} vpmadd52huq 268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x95,0xb5,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: {vex} vpmadd52huq 291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x95,0xb5,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {vex} vpmadd52huq (%rip), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x95,0xb5,0x25,0x00,0x00,0x00,0x00
+
+# ATT: {vex} vpmadd52huq -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x95,0xb5,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: {vex} vpmadd52huq 4064(%rcx), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rcx + 4064]
+0xc4,0x62,0x95,0xb5,0xa1,0xe0,0x0f,0x00,0x00
+
+# ATT: {vex} vpmadd52huq -4096(%rdx), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rdx - 4096]
+0xc4,0x62,0x95,0xb5,0xa2,0x00,0xf0,0xff,0xff
+
+# ATT: {vex} vpmadd52huq 268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x91,0xb5,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: {vex} vpmadd52huq 291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x91,0xb5,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {vex} vpmadd52huq (%rip), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x91,0xb5,0x25,0x00,0x00,0x00,0x00
+
+# ATT: {vex} vpmadd52huq -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x91,0xb5,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: {vex} vpmadd52huq 2032(%rcx), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rcx + 2032]
+0xc4,0x62,0x91,0xb5,0xa1,0xf0,0x07,0x00,0x00
+
+# ATT: {vex} vpmadd52huq -2048(%rdx), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rdx - 2048]
+0xc4,0x62,0x91,0xb5,0xa2,0x00,0xf8,0xff,0xff
+
+# ATT: {vex} vpmadd52luq %ymm14, %ymm13, %ymm12
+# INTEL: {vex} vpmadd52luq ymm12, ymm13, ymm14
+0xc4,0x42,0x95,0xb4,0xe6
+
+# ATT: {vex} vpmadd52luq %xmm14, %xmm13, %xmm12
+# INTEL: {vex} vpmadd52luq xmm12, xmm13, xmm14
+0xc4,0x42,0x91,0xb4,0xe6
+
+# ATT: {vex} vpmadd52luq 268435456(%rbp,%r14,8), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x95,0xb4,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: {vex} vpmadd52luq 291(%r8,%rax,4), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x95,0xb4,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {vex} vpmadd52luq (%rip), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rip]
+0xc4,0x62,0x95,0xb4,0x25,0x00,0x00,0x00,0x00
+
+# ATT: {vex} vpmadd52luq -1024(,%rbp,2), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+0xc4,0x62,0x95,0xb4,0x24,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: {vex} vpmadd52luq 4064(%rcx), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rcx + 4064]
+0xc4,0x62,0x95,0xb4,0xa1,0xe0,0x0f,0x00,0x00
+
+# ATT: {vex} vpmadd52luq -4096(%rdx), %ymm13, %ymm12
+# INTEL: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rdx - 4096]
+0xc4,0x62,0x95,0xb4,0xa2,0x00,0xf0,0xff,0xff
+
+# ATT: {vex} vpmadd52luq 268435456(%rbp,%r14,8), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+0xc4,0x22,0x91,0xb4,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: {vex} vpmadd52luq 291(%r8,%rax,4), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+0xc4,0x42,0x91,0xb4,0xa4,0x80,0x23,0x01,0x00,0x00
+
+# ATT: {vex} vpmadd52luq (%rip), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rip]
+0xc4,0x62,0x91,0xb4,0x25,0x00,0x00,0x00,0x00
+
+# ATT: {vex} vpmadd52luq -512(,%rbp,2), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [2*rbp - 512]
+0xc4,0x62,0x91,0xb4,0x24,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: {vex} vpmadd52luq 2032(%rcx), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rcx + 2032]
+0xc4,0x62,0x91,0xb4,0xa1,0xf0,0x07,0x00,0x00
+
+# ATT: {vex} vpmadd52luq -2048(%rdx), %xmm13, %xmm12
+# INTEL: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rdx - 2048]
+0xc4,0x62,0x91,0xb4,0xa2,0x00,0xf8,0xff,0xff
+
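
The 64-bit file covers the same forms with high registers and RIP-relative
operands; the extra register bits come from the inverted VEX R/X/B and vvvv
fields. For example, the xmm register form of the luq instruction above,
0xc4,0x42,0x91,0xb4,0xe6, disassembles to
{vex} vpmadd52luq %xmm14, %xmm13, %xmm12, and the register numbers can be
recovered as in this illustrative sketch (not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      uint8_t b1 = 0x42, b2 = 0x91, modrm = 0xe6;
      unsigned R = !((b1 >> 7) & 1);     /* stored inverted */
      unsigned B = !((b1 >> 5) & 1);
      unsigned dst  = (R << 3) | ((modrm >> 3) & 7); /* 12 -> xmm12 */
      unsigned src1 = (~(b2 >> 3)) & 0xf;            /* vvvv: 13 -> xmm13 */
      unsigned src2 = (B << 3) | (modrm & 7);        /* 14 -> xmm14 */
      printf("xmm%u, xmm%u, xmm%u\n", dst, src1, src2); /* Intel order */
      return 0;
    }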
diff --git a/llvm/test/MC/X86/avx-ifma-att-32.s b/llvm/test/MC/X86/avx-ifma-att-32.s
new file mode 100644
index 0000000000000..cc0604a2f7d2d
--- /dev/null
+++ b/llvm/test/MC/X86/avx-ifma-att-32.s
@@ -0,0 +1,114 @@
+// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxifma --show-encoding %s | FileCheck %s
+
+// CHECK: {vex} vpmadd52huq %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0xd4]
+ {vex} vpmadd52huq %ymm4, %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52huq %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0xd4]
+ {vex} vpmadd52huq %xmm4, %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52huq 268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x94,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52huq 268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52huq 291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x94,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52huq 291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52huq (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x10]
+ {vex} vpmadd52huq (%eax), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52huq -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpmadd52huq -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52huq 4064(%ecx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x91,0xe0,0x0f,0x00,0x00]
+ {vex} vpmadd52huq 4064(%ecx), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52huq -4096(%edx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x92,0x00,0xf0,0xff,0xff]
+ {vex} vpmadd52huq -4096(%edx), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52huq 268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x94,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52huq 268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52huq 291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x94,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52huq 291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52huq (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x10]
+ {vex} vpmadd52huq (%eax), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52huq -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpmadd52huq -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52huq 2032(%ecx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x91,0xf0,0x07,0x00,0x00]
+ {vex} vpmadd52huq 2032(%ecx), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52huq -2048(%edx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x92,0x00,0xf8,0xff,0xff]
+ {vex} vpmadd52huq -2048(%edx), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52luq %ymm4, %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0xd4]
+ {vex} vpmadd52luq %ymm4, %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52luq %xmm4, %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0xd4]
+ {vex} vpmadd52luq %xmm4, %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52luq 268435456(%esp,%esi,8), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x94,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52luq 268435456(%esp,%esi,8), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52luq 291(%edi,%eax,4), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x94,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52luq 291(%edi,%eax,4), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52luq (%eax), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x10]
+ {vex} vpmadd52luq (%eax), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52luq -1024(,%ebp,2), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpmadd52luq -1024(,%ebp,2), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52luq 4064(%ecx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x91,0xe0,0x0f,0x00,0x00]
+ {vex} vpmadd52luq 4064(%ecx), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52luq -4096(%edx), %ymm3, %ymm2
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x92,0x00,0xf0,0xff,0xff]
+ {vex} vpmadd52luq -4096(%edx), %ymm3, %ymm2
+
+// CHECK: {vex} vpmadd52luq 268435456(%esp,%esi,8), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x94,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52luq 268435456(%esp,%esi,8), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52luq 291(%edi,%eax,4), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x94,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52luq 291(%edi,%eax,4), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52luq (%eax), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x10]
+ {vex} vpmadd52luq (%eax), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52luq -512(,%ebp,2), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpmadd52luq -512(,%ebp,2), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52luq 2032(%ecx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x91,0xf0,0x07,0x00,0x00]
+ {vex} vpmadd52luq 2032(%ecx), %xmm3, %xmm2
+
+// CHECK: {vex} vpmadd52luq -2048(%edx), %xmm3, %xmm2
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x92,0x00,0xf8,0xff,0xff]
+ {vex} vpmadd52luq -2048(%edx), %xmm3, %xmm2
+
diff --git a/llvm/test/MC/X86/avx-ifma-att-64.s b/llvm/test/MC/X86/avx-ifma-att-64.s
new file mode 100644
index 0000000000000..0d7b60511083c
--- /dev/null
+++ b/llvm/test/MC/X86/avx-ifma-att-64.s
@@ -0,0 +1,114 @@
+// RUN: llvm-mc -triple=x86_64-unknown-unknown -mattr=+avxifma --show-encoding < %s | FileCheck %s
+
+// CHECK: {vex} vpmadd52huq %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x95,0xb5,0xe6]
+ {vex} vpmadd52huq %ymm14, %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52huq %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x91,0xb5,0xe6]
+ {vex} vpmadd52huq %xmm14, %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52huq 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x95,0xb5,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52huq 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52huq 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x95,0xb5,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52huq 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52huq (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x95,0xb5,0x25,0x00,0x00,0x00,0x00]
+ {vex} vpmadd52huq (%rip), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52huq -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x95,0xb5,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpmadd52huq -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52huq 4064(%rcx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x95,0xb5,0xa1,0xe0,0x0f,0x00,0x00]
+ {vex} vpmadd52huq 4064(%rcx), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52huq -4096(%rdx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x95,0xb5,0xa2,0x00,0xf0,0xff,0xff]
+ {vex} vpmadd52huq -4096(%rdx), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52huq 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x91,0xb5,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52huq 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52huq 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x91,0xb5,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52huq 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52huq (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x91,0xb5,0x25,0x00,0x00,0x00,0x00]
+ {vex} vpmadd52huq (%rip), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52huq -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x91,0xb5,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpmadd52huq -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52huq 2032(%rcx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x91,0xb5,0xa1,0xf0,0x07,0x00,0x00]
+ {vex} vpmadd52huq 2032(%rcx), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52huq -2048(%rdx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x91,0xb5,0xa2,0x00,0xf8,0xff,0xff]
+ {vex} vpmadd52huq -2048(%rdx), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52luq %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x95,0xb4,0xe6]
+ {vex} vpmadd52luq %ymm14, %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52luq %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x91,0xb4,0xe6]
+ {vex} vpmadd52luq %xmm14, %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52luq 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x95,0xb4,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52luq 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52luq 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x95,0xb4,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52luq 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52luq (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x95,0xb4,0x25,0x00,0x00,0x00,0x00]
+ {vex} vpmadd52luq (%rip), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52luq -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x95,0xb4,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpmadd52luq -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52luq 4064(%rcx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x95,0xb4,0xa1,0xe0,0x0f,0x00,0x00]
+ {vex} vpmadd52luq 4064(%rcx), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52luq -4096(%rdx), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x95,0xb4,0xa2,0x00,0xf0,0xff,0xff]
+ {vex} vpmadd52luq -4096(%rdx), %ymm13, %ymm12
+
+// CHECK: {vex} vpmadd52luq 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x91,0xb4,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52luq 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52luq 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x91,0xb4,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52luq 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52luq (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x91,0xb4,0x25,0x00,0x00,0x00,0x00]
+ {vex} vpmadd52luq (%rip), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52luq -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x91,0xb4,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpmadd52luq -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52luq 2032(%rcx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x91,0xb4,0xa1,0xf0,0x07,0x00,0x00]
+ {vex} vpmadd52luq 2032(%rcx), %xmm13, %xmm12
+
+// CHECK: {vex} vpmadd52luq -2048(%rdx), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x91,0xb4,0xa2,0x00,0xf8,0xff,0xff]
+ {vex} vpmadd52luq -2048(%rdx), %xmm13, %xmm12
+
diff --git a/llvm/test/MC/X86/avx-ifma-intel-32.s b/llvm/test/MC/X86/avx-ifma-intel-32.s
new file mode 100644
index 0000000000000..bba0c91a4b2c9
--- /dev/null
+++ b/llvm/test/MC/X86/avx-ifma-intel-32.s
@@ -0,0 +1,114 @@
+// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxifma -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: {vex} vpmadd52huq ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0xd4]
+ {vex} vpmadd52huq ymm2, ymm3, ymm4
+
+// CHECK: {vex} vpmadd52huq xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0xd4]
+ {vex} vpmadd52huq xmm2, xmm3, xmm4
+
+// CHECK: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x94,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x94,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x10]
+ {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x91,0xe0,0x0f,0x00,0x00]
+ {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb5,0x92,0x00,0xf0,0xff,0xff]
+ {vex} vpmadd52huq ymm2, ymm3, ymmword ptr [edx - 4096]
+
+// CHECK: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x94,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x94,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x10]
+ {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x91,0xf0,0x07,0x00,0x00]
+ {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb5,0x92,0x00,0xf8,0xff,0xff]
+ {vex} vpmadd52huq xmm2, xmm3, xmmword ptr [edx - 2048]
+
+// CHECK: {vex} vpmadd52luq ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0xd4]
+ {vex} vpmadd52luq ymm2, ymm3, ymm4
+
+// CHECK: {vex} vpmadd52luq xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0xd4]
+ {vex} vpmadd52luq xmm2, xmm3, xmm4
+
+// CHECK: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x94,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x94,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x10]
+ {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x14,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [ecx + 4064]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x91,0xe0,0x0f,0x00,0x00]
+ {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [ecx + 4064]
+
+// CHECK: {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [edx - 4096]
+// CHECK: encoding: [0xc4,0xe2,0xe5,0xb4,0x92,0x00,0xf0,0xff,0xff]
+ {vex} vpmadd52luq ymm2, ymm3, ymmword ptr [edx - 4096]
+
+// CHECK: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x94,0xf4,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x94,0x87,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x10]
+ {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x14,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [ecx + 2032]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x91,0xf0,0x07,0x00,0x00]
+ {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [ecx + 2032]
+
+// CHECK: {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [edx - 2048]
+// CHECK: encoding: [0xc4,0xe2,0xe1,0xb4,0x92,0x00,0xf8,0xff,0xff]
+ {vex} vpmadd52luq xmm2, xmm3, xmmword ptr [edx - 2048]
+
diff --git a/llvm/test/MC/X86/avx-ifma-intel-64.s b/llvm/test/MC/X86/avx-ifma-intel-64.s
new file mode 100644
index 0000000000000..dcfa57f67ef38
--- /dev/null
+++ b/llvm/test/MC/X86/avx-ifma-intel-64.s
@@ -0,0 +1,114 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+avxifma -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: {vex} vpmadd52huq ymm12, ymm13, ymm14
+// CHECK: encoding: [0xc4,0x42,0x95,0xb5,0xe6]
+ {vex} vpmadd52huq ymm12, ymm13, ymm14
+
+// CHECK: {vex} vpmadd52huq xmm12, xmm13, xmm14
+// CHECK: encoding: [0xc4,0x42,0x91,0xb5,0xe6]
+ {vex} vpmadd52huq xmm12, xmm13, xmm14
+
+// CHECK: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x95,0xb5,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x95,0xb5,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x95,0xb5,0x25,0x00,0x00,0x00,0x00]
+ {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x95,0xb5,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0x62,0x95,0xb5,0xa1,0xe0,0x0f,0x00,0x00]
+ {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rcx + 4064]
+
+// CHECK: {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0x62,0x95,0xb5,0xa2,0x00,0xf0,0xff,0xff]
+ {vex} vpmadd52huq ymm12, ymm13, ymmword ptr [rdx - 4096]
+
+// CHECK: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x91,0xb5,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x91,0xb5,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x91,0xb5,0x25,0x00,0x00,0x00,0x00]
+ {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x91,0xb5,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0x62,0x91,0xb5,0xa1,0xf0,0x07,0x00,0x00]
+ {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rcx + 2032]
+
+// CHECK: {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0x62,0x91,0xb5,0xa2,0x00,0xf8,0xff,0xff]
+ {vex} vpmadd52huq xmm12, xmm13, xmmword ptr [rdx - 2048]
+
+// CHECK: {vex} vpmadd52luq ymm12, ymm13, ymm14
+// CHECK: encoding: [0xc4,0x42,0x95,0xb4,0xe6]
+ {vex} vpmadd52luq ymm12, ymm13, ymm14
+
+// CHECK: {vex} vpmadd52luq xmm12, xmm13, xmm14
+// CHECK: encoding: [0xc4,0x42,0x91,0xb4,0xe6]
+ {vex} vpmadd52luq xmm12, xmm13, xmm14
+
+// CHECK: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x95,0xb4,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x95,0xb4,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x95,0xb4,0x25,0x00,0x00,0x00,0x00]
+ {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x95,0xb4,0x24,0x6d,0x00,0xfc,0xff,0xff]
+ {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rcx + 4064]
+// CHECK: encoding: [0xc4,0x62,0x95,0xb4,0xa1,0xe0,0x0f,0x00,0x00]
+ {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rcx + 4064]
+
+// CHECK: {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rdx - 4096]
+// CHECK: encoding: [0xc4,0x62,0x95,0xb4,0xa2,0x00,0xf0,0xff,0xff]
+ {vex} vpmadd52luq ymm12, ymm13, ymmword ptr [rdx - 4096]
+
+// CHECK: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x91,0xb4,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x91,0xb4,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x91,0xb4,0x25,0x00,0x00,0x00,0x00]
+ {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x91,0xb4,0x24,0x6d,0x00,0xfe,0xff,0xff]
+ {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rcx + 2032]
+// CHECK: encoding: [0xc4,0x62,0x91,0xb4,0xa1,0xf0,0x07,0x00,0x00]
+ {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rcx + 2032]
+
+// CHECK: {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rdx - 2048]
+// CHECK: encoding: [0xc4,0x62,0x91,0xb4,0xa2,0x00,0xf8,0xff,0xff]
+ {vex} vpmadd52luq xmm12, xmm13, xmmword ptr [rdx - 2048]
+
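
For reference, a usage sketch of the new C intrinsics that this patch exposes
through avxifmaintrin.h (included from immintrin.h). The intrinsic names are
taken from the release-note entries in this patch; the operand order, with the
accumulator first, is assumed to match the existing AVX512-IFMA intrinsics,
and -mavxifma is assumed to follow the usual -m<feature> driver convention for
the Options.td change:

    /* Build with (assumed flag spelling): clang -mavxifma example.c */
    #include <immintrin.h>

    __m256i accumulate_lo52(__m256i acc, __m256i x, __m256i y) {
      /* Per 64-bit lane: acc += bits 51:0 of (x[51:0] * y[51:0]). */
      return _mm256_madd52lo_avx_epu64(acc, x, y);
    }

    __m128i accumulate_hi52(__m128i acc, __m128i x, __m128i y) {
      /* Per 64-bit lane: acc += bits 103:52 of the 104-bit product. */
      return _mm_madd52hi_avx_epu64(acc, x, y);
    }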