[llvm] f77101e - [X86][AMX] Support AMX-MOVRS (#115151)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 11 23:05:47 PST 2024
Author: Malay Sanghi
Date: 2024-11-12T15:05:43+08:00
New Revision: f77101ea7913ab6a6b28ad03c152c615a89900f6
URL: https://github.com/llvm/llvm-project/commit/f77101ea7913ab6a6b28ad03c152c615a89900f6
DIFF: https://github.com/llvm/llvm-project/commit/f77101ea7913ab6a6b28ad03c152c615a89900f6.diff
LOG: [X86][AMX] Support AMX-MOVRS (#115151)
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368
Added:
clang/lib/Headers/amxmovrsintrin.h
clang/lib/Headers/amxmovrstransposeintrin.h
clang/test/CodeGen/X86/amx_movrs.c
clang/test/CodeGen/X86/amx_movrs_api.c
clang/test/CodeGen/X86/amx_movrs_errors.c
clang/test/CodeGen/X86/amx_movrs_tranpose.c
clang/test/CodeGen/X86/amx_movrs_tranpose_api.c
clang/test/CodeGen/X86/amx_movrs_transpose_errors.c
llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll
llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll
llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt
llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s
llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s
Modified:
clang/docs/ReleaseNotes.rst
clang/include/clang/Basic/BuiltinsX86_64.def
clang/include/clang/Driver/Options.td
clang/lib/Basic/Targets/X86.cpp
clang/lib/Basic/Targets/X86.h
clang/lib/CodeGen/CGBuiltin.cpp
clang/lib/Headers/CMakeLists.txt
clang/lib/Headers/immintrin.h
clang/lib/Sema/SemaX86.cpp
llvm/include/llvm/IR/IntrinsicsX86.td
llvm/include/llvm/TargetParser/X86TargetParser.def
llvm/lib/Target/X86/X86.td
llvm/lib/Target/X86/X86ExpandPseudo.cpp
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86InstrAMX.td
llvm/lib/Target/X86/X86InstrPredicates.td
llvm/lib/Target/X86/X86LowerAMXType.cpp
llvm/lib/Target/X86/X86RegisterInfo.cpp
llvm/lib/TargetParser/Host.cpp
llvm/lib/TargetParser/X86TargetParser.cpp
Removed:
################################################################################
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 4ef48bed58d95c..c8aa053986cc00 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -741,6 +741,7 @@ X86 Support
* Supported intrinsics of ``_mm(256|512)_(mask(z))_loadrs_epi(8|16|32|64)``.
- Support ISA of ``AMX-FP8``.
- Support ISA of ``AMX-TRANSPOSE``.
+- Support ISA of ``AMX-MOVRS``.
- Support ISA of ``AMX-AVX512``.
- Support ISA of ``AMX-TF32``.
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 25c10d39df32e2..f853b4313dae07 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -117,7 +117,9 @@ TARGET_BUILTIN(__builtin_ia32_uwrmsr, "vULLiULLi", "n", "usermsr")
// AMX internal builtin
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tileloaddrs64_internal, "V256iUsUsvC*z", "n", "amx-movrs")
TARGET_BUILTIN(__builtin_ia32_tileloaddt164_internal, "V256iUsUsvC*z", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tileloaddrst164_internal, "V256iUsUsvC*z", "n", "amx-movrs")
TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
@@ -129,10 +131,15 @@ TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i",
TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose")
+
TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
@@ -147,6 +154,13 @@ TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tilerelease, "v", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tilezero, "vUc", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
+
+TARGET_BUILTIN(__builtin_ia32_tileloaddrs64, "vIUcvC*z", "n", "amx-movrs")
+TARGET_BUILTIN(__builtin_ia32_tileloaddrst164, "vIUcvC*z", "n", "amx-movrs")
TARGET_BUILTIN(__builtin_ia32_tileloadd64, "vIUcvC*z", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tileloaddt164, "vIUcvC*z", "n", "amx-tile")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 1304ef3c5a228b..9fb7f8bb6489b0 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6303,6 +6303,8 @@ def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>;
def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>;
+def mamx_movrs: Flag<["-"], "mamx-movrs">, Group<m_x86_Features_Group>;
+def mno_amx_movrs: Flag<["-"], "mno-amx-movrs">, Group<m_x86_Features_Group>;
def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>;
def mno_cmpccxadd : Flag<["-"], "mno-cmpccxadd">, Group<m_x86_Features_Group>;
def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index dc85e9aa77cd3d..086b4415412e67 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -430,6 +430,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAMXCOMPLEX = true;
} else if (Feature == "+amx-fp8") {
HasAMXFP8 = true;
+ } else if (Feature == "+amx-movrs") {
+ HasAMXMOVRS = true;
} else if (Feature == "+amx-transpose") {
HasAMXTRANSPOSE = true;
} else if (Feature == "+amx-avx512") {
@@ -957,6 +959,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AMX_COMPLEX__");
if (HasAMXFP8)
Builder.defineMacro("__AMX_FP8__");
+ if (HasAMXMOVRS)
+ Builder.defineMacro("__AMX_MOVRS__");
if (HasAMXTRANSPOSE)
Builder.defineMacro("__AMX_TRANSPOSE__");
if (HasAMXAVX512)
@@ -1094,6 +1098,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("amx-fp16", true)
.Case("amx-fp8", true)
.Case("amx-int8", true)
+ .Case("amx-movrs", true)
.Case("amx-tf32", true)
.Case("amx-tile", true)
.Case("amx-transpose", true)
@@ -1216,6 +1221,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("amx-fp16", HasAMXFP16)
.Case("amx-fp8", HasAMXFP8)
.Case("amx-int8", HasAMXINT8)
+ .Case("amx-movrs", HasAMXMOVRS)
.Case("amx-tf32", HasAMXTF32)
.Case("amx-tile", HasAMXTILE)
.Case("amx-transpose", HasAMXTRANSPOSE)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index 04b1d5d33ea231..06a7eed8177cb2 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -158,6 +158,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasAMXBF16 = false;
bool HasAMXCOMPLEX = false;
bool HasAMXFP8 = false;
+ bool HasAMXMOVRS = false;
bool HasAMXTRANSPOSE = false;
bool HasAMXAVX512 = false;
bool HasAMXTF32 = false;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 430ac5626f89d7..0807542825f634 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17025,9 +17025,13 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
}
// Corresponding to intrisics which will return 2 tiles (tile0_tile1).
case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
+ case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
+ case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
- case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: {
+ case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
+ case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
+ case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: {
Intrinsic::ID IID;
switch (BuiltinID) {
default:
@@ -17035,15 +17039,27 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
IID = Intrinsic::x86_t2rpntlvwz0_internal;
break;
+ case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
+ IID = Intrinsic::x86_t2rpntlvwz0rs_internal;
+ break;
case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
break;
+ case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
+ IID = Intrinsic::x86_t2rpntlvwz0rst1_internal;
+ break;
case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
IID = Intrinsic::x86_t2rpntlvwz1_internal;
break;
+ case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
+ IID = Intrinsic::x86_t2rpntlvwz1rs_internal;
+ break;
case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
break;
+ case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal:
+ IID = Intrinsic::x86_t2rpntlvwz1rst1_internal;
+ break;
}
// Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index a094305bcec5e4..7227df93edece4 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -151,6 +151,8 @@ set(x86_files
amxfp16intrin.h
amxfp8intrin.h
amxintrin.h
+ amxmovrsintrin.h
+ amxmovrstransposeintrin.h
amxtf32intrin.h
amxtf32transposeintrin.h
amxtransposeintrin.h
diff --git a/clang/lib/Headers/amxmovrsintrin.h b/clang/lib/Headers/amxmovrsintrin.h
new file mode 100644
index 00000000000000..5fe2fdecb8b5dd
--- /dev/null
+++ b/clang/lib/Headers/amxmovrsintrin.h
@@ -0,0 +1,48 @@
+/*===-------- amxmovrsintrin.h - AMX MOVRS intrinsics -*- C++ -*---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * ===-------------------------------------------------------------------=== */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxmovrsintrin.h> directly; include <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMXMOVRSINTRIN_H
+#define __AMXMOVRSINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_MOVRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-movrs")))
+
+#define _tile_loaddrs(dst, base, stride) \
+ __builtin_ia32_tileloaddrs64((dst), ((const void *)(base)), \
+ (__SIZE_TYPE__)(stride))
+#define _tile_stream_loaddrs(dst, base, stride) \
+ __builtin_ia32_tileloaddrst164((dst), ((const void *)(base)), \
+ (__SIZE_TYPE__)(stride))
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS
+_tile_loaddrs_internal(unsigned short m, unsigned short n, const void *base,
+ __SIZE_TYPE__ stride) {
+ return __builtin_ia32_tileloaddrs64_internal(m, n, base,
+ (__SIZE_TYPE__)(stride));
+}
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS
+_tile_loaddrst1_internal(unsigned short m, unsigned short n, const void *base,
+ __SIZE_TYPE__ stride) {
+ return __builtin_ia32_tileloaddrst164_internal(m, n, base,
+ (__SIZE_TYPE__)(stride));
+}
+static __inline__ void __DEFAULT_FN_ATTRS_MOVRS
+__tile_loaddrs(__tile1024i *dst, const void *base, __SIZE_TYPE__ stride) {
+ dst->tile = _tile_loaddrs_internal(dst->row, dst->col, base, stride);
+}
+static __inline__ void __DEFAULT_FN_ATTRS_MOVRS __tile_stream_loaddrs(
+ __tile1024i *dst, const void *base, __SIZE_TYPE__ stride) {
+ dst->tile = _tile_loaddrst1_internal(dst->row, dst->col, base, stride);
+}
+#undef __DEFAULT_FN_ATTRS_MOVRS
+#endif /* __x86_64__ */
+#endif /* __AMXMOVRSINTRIN_H */
diff --git a/clang/lib/Headers/amxmovrstransposeintrin.h b/clang/lib/Headers/amxmovrstransposeintrin.h
new file mode 100644
index 00000000000000..17a9f7506a0421
--- /dev/null
+++ b/clang/lib/Headers/amxmovrstransposeintrin.h
@@ -0,0 +1,200 @@
+/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics --------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * ===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <amxmovrstransposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_MOVRS_TRANSPOSEINTRIN_H
+#define __AMX_MOVRS_TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("amx-transpose,amx-movrs")))
+
+#define _tile_2rpntlvwz0rs(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride)
+#define _tile_2rpntlvwz0rst1(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride)
+#define _tile_2rpntlvwz1rs(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride)
+#define _tile_2rpntlvwz1rst1(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride)
+
+static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rs_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ // Use __tile1024i_1024a* to escape the alignment check in
+ // clang/test/Headers/x86-intrinsics-headers-clean.cpp
+ __builtin_ia32_t2rpntlvwz0rs_internal(
+ row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rst1_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ __builtin_ia32_t2rpntlvwz0rst1_internal(
+ row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rs_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ __builtin_ia32_t2rpntlvwz1rs_internal(
+ row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rst1_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ __builtin_ia32_t2rpntlvwz1rst1_internal(
+ row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+/// Provides a hint to the implementation that the data will likely become
+/// read shared in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS
+static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1RS </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS
+static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written. The last row will be not be read from memory but instead
+/// filled with zeros.
+/// Provides a hint to the implementation that the data will likely become
+/// read shared in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS
+static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written. The last row will be not be read from memory but instead
+/// filled with zeros.
+/// Provides a hint to the implementation that the data will likely become
+/// read shared in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1RS </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS
+static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+#undef __DEFAULT_FN_ATTRS
+#endif /* __x86_64__ */
+#endif /* __AMX_MOVRS_TRANSPOSEINTRIN_H */
\ No newline at end of file
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 87a502238ae162..f0dd7160ec7ff4 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -656,6 +656,15 @@ _storebe_i64(void * __P, long long __D) {
#include <amxtransposeintrin.h>
#endif
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_MOVRS__)
+#include <amxmovrsintrin.h>
+#endif
+
+#if !defined(__SCE__) || __has_feature(modules) || \
+ (defined(__AMX_MOVRS__) && defined(__AMX_TRANSPOSE__))
+#include <amxmovrstransposeintrin.h>
+#endif
+
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_AVX512__)
#include <amxavx512intrin.h>
#endif
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index d7c8ed351f410a..85b195491025d3 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -629,12 +629,18 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
return false;
case X86::BI__builtin_ia32_tileloadd64:
case X86::BI__builtin_ia32_tileloaddt164:
+ case X86::BI__builtin_ia32_tileloaddrs64:
+ case X86::BI__builtin_ia32_tileloaddrst164:
case X86::BI__builtin_ia32_tilestored64:
case X86::BI__builtin_ia32_tilezero:
case X86::BI__builtin_ia32_t2rpntlvwz0:
case X86::BI__builtin_ia32_t2rpntlvwz0t1:
case X86::BI__builtin_ia32_t2rpntlvwz1:
case X86::BI__builtin_ia32_t2rpntlvwz1t1:
+ case X86::BI__builtin_ia32_t2rpntlvwz0rst1:
+ case X86::BI__builtin_ia32_t2rpntlvwz1rs:
+ case X86::BI__builtin_ia32_t2rpntlvwz1rst1:
+ case X86::BI__builtin_ia32_t2rpntlvwz0rs:
case X86::BI__builtin_ia32_tcvtrowps2pbf16h:
case X86::BI__builtin_ia32_tcvtrowps2pbf16l:
case X86::BI__builtin_ia32_tcvtrowps2phh:
diff --git a/clang/test/CodeGen/X86/amx_movrs.c b/clang/test/CodeGen/X86/amx_movrs.c
new file mode 100755
index 00000000000000..4a8f001baafcea
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_movrs.c
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-movrs -target-feature +avx512f \
+// RUN: -emit-llvm -o - -Wall -Werror -pedantic \
+// RUN: -Wno-gnu-statement-expression| FileCheck %s
+
+#include <immintrin.h>
+#include <stddef.h>
+
+#define STRIDE 32
+
+char buf[1024];
+
+void test_tile_loadd(short row, short col) {
+ // CHECK-LABEL: define dso_local void @test_tile_loadd(
+ // CHECK: call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+ // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+ _tile_loaddrs_internal(row, col, buf, STRIDE);
+}
+
+void test_tile_loaddt1(short row, short col) {
+ // CHECK-LABEL: define dso_local void @test_tile_loaddt1(
+ // CHECK: call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+ // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+ _tile_loaddrst1_internal(row, col, buf, STRIDE);
+}
diff --git a/clang/test/CodeGen/X86/amx_movrs_api.c b/clang/test/CodeGen/X86/amx_movrs_api.c
new file mode 100755
index 00000000000000..cf430adf140852
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_movrs_api.c
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-movrs -emit-llvm -o - -Wall -Werror -pedantic \
+// RUN: -Wno-gnu-statement-expression| FileCheck %s
+
+#include <immintrin.h>
+#include <stddef.h>
+
+#define STRIDE 32
+
+char buf[1024];
+
+void test_tile_loadd(short row) {
+ // CHECK-LABEL: define dso_local void @test_tile_loadd(
+ // CHECK: call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+ // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+ __tile1024i a = {row, 8};
+ __tile_loaddrs(&a, buf, STRIDE);
+}
+
+void test_tile_loaddt1(short row) {
+ // CHECK-LABEL: define dso_local void @test_tile_loaddt1(
+ // CHECK: call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+ // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+ __tile1024i a = {row, 8};
+ __tile_stream_loaddrs(&a, buf, STRIDE);
+}
+
+void test_tile_loadd_macro(void *data) {
+ // CHECK-LABEL: define dso_local void @test_tile_loadd_macro(
+ // CHECK: call void @llvm.x86.tileloaddrs64(i8 {{.*}}, ptr %{{.*}}, i64 {{.*}})
+ // CHECK: call void @llvm.x86.tileloaddrst164(i8 {{.*}}, ptr %{{.*}}, i64 {{.*}})
+ _tile_loaddrs(4, data, STRIDE);
+ _tile_stream_loaddrs(2, data, STRIDE);
+}
diff --git a/clang/test/CodeGen/X86/amx_movrs_errors.c b/clang/test/CodeGen/X86/amx_movrs_errors.c
new file mode 100755
index 00000000000000..495ea299236e2f
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_movrs_errors.c
@@ -0,0 +1,13 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-movrs -verify
+
+#include <immintrin.h>
+#include <stddef.h>
+
+char buf[1024];
+
+void test_tile_load() {
+ _tile_loaddrs(20, buf, 32); // expected-error {{argument value 20 is outside the valid range [0, 7]}}
+ _tile_stream_loaddrs(-1, buf, 20); // expected-error {{argument value 255 is outside the valid range [0, 7]}}
+}
diff --git a/clang/test/CodeGen/X86/amx_movrs_tranpose.c b/clang/test/CodeGen/X86/amx_movrs_tranpose.c
new file mode 100755
index 00000000000000..192c153835e1e6
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_movrs_tranpose.c
@@ -0,0 +1,53 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-movrs -emit-llvm -o - -Wall -Werror -pedantic \
+// RUN: -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s
+
+#include <immintrin.h>
+#include <stddef.h>
+
+char buf[2048];
+#define STRIDE 32
+
+// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz0rs_internal(
+// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+void test_tile_2rpntlvwz0rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
+ _tile_2rpntlvwz0rs_internal(row, col0, col1, D0, D1, B, 1);
+}
+
+// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz0rst1_internal(
+// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+void test_tile_2rpntlvwz0rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
+ _tile_2rpntlvwz0rst1_internal(row, col0, col1, D0, D1, B, 1);
+}
+
+// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz1rs_internal(
+// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+void test_tile_2rpntlvwz1rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
+ _tile_2rpntlvwz1rs_internal(row, col0, col1, D0, D1, B, 1);
+}
+
+// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz1rst1_internal(
+// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+void test_tile_2rpntlvwz1rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
+ _tile_2rpntlvwz1rst1_internal(row, col0, col1, D0, D1, B, 1);
+}
diff --git a/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c b/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c
new file mode 100755
index 00000000000000..b174cc5067bf30
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c
@@ -0,0 +1,81 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-movrs -emit-llvm -o - -Wall -Werror -pedantic \
+// RUN: -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s
+
+#include <immintrin.h>
+#include <stddef.h>
+
+char buf[2048];
+#define STRIDE 32
+
+void test_tile_2rpntlvwz0rs(const void *A, size_t B) {
+ // CHECK-LABEL: @test_tile_2rpntlvwz0rs
+ // CHECK: call void @llvm.x86.t2rpntlvwz0rs(i8 1, ptr %{{.*}}, i64 %{{.*}})
+ _tile_2rpntlvwz0rs(1, A, B);
+}
+
+void test_tile_2rpntlvwz0rst1(const void *A, size_t B) {
+ // CHECK-LABEL: @test_tile_2rpntlvwz0rst1
+ // CHECK: call void @llvm.x86.t2rpntlvwz0rst1(i8 1, ptr %{{.*}}, i64 %{{.*}})
+ _tile_2rpntlvwz0rst1(1, A, B);
+}
+
+void test_tile_2rpntlvwz1rs(const void *A, size_t B) {
+ // CHECK-LABEL: @test_tile_2rpntlvwz1rs
+ // CHECK: call void @llvm.x86.t2rpntlvwz1rs(i8 1, ptr %{{.*}}, i64 %{{.*}})
+ _tile_2rpntlvwz1rs(1, A, B);
+}
+
+void test_tile_2rpntlvwz1rst1(const void *A, size_t B) {
+ // CHECK-LABEL: @test_tile_2rpntlvwz1rst1
+ // CHECK: call void @llvm.x86.t2rpntlvwz1rst1(i8 1, ptr %{{.*}}, i64 %{{.*}})
+ _tile_2rpntlvwz1rst1(1, A, B);
+}
+
+void test__tile_2rpntlvwz0rs(__tile1024i dst0, __tile1024i dst1) {
+ //CHECK-LABEL: @test__tile_2rpntlvwz0rs
+ //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ __tile_2rpntlvwz0rs(&dst0, &dst1, buf, STRIDE);
+}
+
+void test__tile_2rpntlvwz0rst1(__tile1024i dst0, __tile1024i dst1) {
+ //CHECK-LABEL: @test__tile_2rpntlvwz0rst1
+ //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ __tile_2rpntlvwz0rst1(&dst0, &dst1, buf, STRIDE);
+}
+
+void test__tile_2rpntlvwz1rs(__tile1024i dst0, __tile1024i dst1) {
+ //CHECK-LABEL: @test__tile_2rpntlvwz1rs
+ //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ __tile_2rpntlvwz1rs(&dst0, &dst1, buf, STRIDE);
+}
+
+void test__tile_2rpntlvwz1rst1(__tile1024i dst0, __tile1024i dst1) {
+ //CHECK-LABEL: @test__tile_2rpntlvwz1rst1
+ //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ __tile_2rpntlvwz1rst1(&dst0, &dst1, buf, STRIDE);
+}
diff --git a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c
new file mode 100755
index 00000000000000..840b52bbb29bbf
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-int8 -target-feature +amx-transpose -target-feature +amx-movrs \
+// RUN: -verify
+
+#include <immintrin.h>
+#include <stddef.h>
+
+void test_tile_2rpntlvwz0rs(const void *A, size_t B) {
+ _tile_2rpntlvwz0rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
+
+void test_tile_2rpntlvwz0rst1(const void *A, size_t B) {
+ _tile_2rpntlvwz0rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
+
+void test_tile_2rpntlvwz1rs(const void *A, size_t B) {
+ _tile_2rpntlvwz1rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
+
+void test_tile_2rpntlvwz1rst1(const void *A, size_t B) {
+ _tile_2rpntlvwz1rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index b2d6f44b7927a9..fcb506e9ebbfcc 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5882,6 +5882,12 @@ let TargetPrefix = "x86" in {
def int_x86_tilestored64 : ClangBuiltin<"__builtin_ia32_tilestored64">,
Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
[ImmArg<ArgIndex<0>>]>;
+ def int_x86_tileloaddrs64 : ClangBuiltin<"__builtin_ia32_tileloaddrs64">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+ [ImmArg<ArgIndex<0>>]>;
+ def int_x86_tileloaddrst164 : ClangBuiltin<"__builtin_ia32_tileloaddrst164">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+ [ImmArg<ArgIndex<0>>]>;
def int_x86_tdpbssd : ClangBuiltin<"__builtin_ia32_tdpbssd">,
Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
@@ -5952,6 +5958,20 @@ let TargetPrefix = "x86" in {
Intrinsic<[], [llvm_i8_ty, llvm_i8_ty],
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
+ // AMX-MORVS, AMX-TRANSPOSE
+ def int_x86_t2rpntlvwz0rs : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0rs">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+ [ImmArg<ArgIndex<0>>]>;
+ def int_x86_t2rpntlvwz0rst1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0rst1">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+ [ImmArg<ArgIndex<0>>]>;
+ def int_x86_t2rpntlvwz1rs : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1rs">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+ [ImmArg<ArgIndex<0>>]>;
+ def int_x86_t2rpntlvwz1rst1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1rst1">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+ [ImmArg<ArgIndex<0>>]>;
+
// AMX-AVX512
def int_x86_tcvtrowd2ps : ClangBuiltin<"__builtin_ia32_tcvtrowd2ps">,
Intrinsic<[llvm_v16f32_ty], [llvm_i8_ty, llvm_i32_ty],
@@ -5986,6 +6006,16 @@ let TargetPrefix = "x86" in {
Intrinsic<[llvm_x86amx_ty],
[llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
[]>;
+ def int_x86_tileloaddrs64_internal :
+ ClangBuiltin<"__builtin_ia32_tileloaddrs64_internal">,
+ Intrinsic<[llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+ []>;
+ def int_x86_tileloaddrst164_internal :
+ ClangBuiltin<"__builtin_ia32_tileloaddrst164_internal">,
+ Intrinsic<[llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+ []>;
def int_x86_tdpbssd_internal :
ClangBuiltin<"__builtin_ia32_tdpbssd_internal">,
Intrinsic<[llvm_x86amx_ty],
@@ -6050,6 +6080,24 @@ let TargetPrefix = "x86" in {
llvm_x86amx_ty, llvm_x86amx_ty,
llvm_x86amx_ty], []>;
+ // AMX-MORVS, AMX-TRANSPOSE - internal intrinsics
+ def int_x86_t2rpntlvwz0rs_internal :
+ Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+ [IntrArgMemOnly, IntrReadMem]>;
+ def int_x86_t2rpntlvwz0rst1_internal :
+ Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+ [IntrArgMemOnly]>;
+ def int_x86_t2rpntlvwz1rs_internal :
+ Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+ [IntrArgMemOnly]>;
+ def int_x86_t2rpntlvwz1rst1_internal :
+ Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+ [IntrArgMemOnly]>;
+
def int_x86_t2rpntlvwz0_internal :
Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index 3b643563775688..5cad8e117f4ee2 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -266,6 +266,7 @@ X86_FEATURE (MOVRS, "movrs")
X86_FEATURE (ZU, "zu")
X86_FEATURE (AMX_FP8, "amx-fp8")
X86_FEATURE (AMX_TRANSPOSE, "amx-transpose")
+X86_FEATURE (AMX_MOVRS, "amx-movrs")
X86_FEATURE (AMX_AVX512, "amx-avx512")
X86_FEATURE (AMX_TF32, "amx-tf32")
// These features aren't really CPU features, but the frontend can set them.
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 35bbffdb20942d..509632183dc016 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -273,6 +273,9 @@ def FeatureAMXCOMPLEX : SubtargetFeature<"amx-complex", "HasAMXCOMPLEX", "true",
def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true",
"Support AMX-FP8 instructions",
[FeatureAMXTILE]>;
+def FeatureAMXMOVRS : SubtargetFeature<"amx-movrs", "HasAMXMOVRS", "true",
+ "Support AMX-MOVRS instructions",
+ [FeatureAMXTILE]>;
def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true",
"Support AMX amx-transpose instructions",
[FeatureAMXTILE]>;
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 4f045d78f75fb2..73ca4b09c0aa53 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -559,6 +559,8 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
}
case X86::PTILELOADDV:
case X86::PTILELOADDT1V:
+ case X86::PTILELOADDRSV:
+ case X86::PTILELOADDRST1V:
case X86::PTCVTROWD2PSrreV:
case X86::PTCVTROWD2PSrriV:
case X86::PTCVTROWPS2PBF16HrreV:
@@ -575,6 +577,12 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MI.removeOperand(i);
unsigned Opc;
switch (Opcode) {
+ case X86::PTILELOADDRSV:
+ Opc = X86::TILELOADDRS;
+ break;
+ case X86::PTILELOADDRST1V:
+ Opc = X86::TILELOADDRST1;
+ break;
case X86::PTILELOADDV:
Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
break;
@@ -719,7 +727,11 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case X86::PT2RPNTLVWZ0V:
case X86::PT2RPNTLVWZ0T1V:
case X86::PT2RPNTLVWZ1V:
- case X86::PT2RPNTLVWZ1T1V: {
+ case X86::PT2RPNTLVWZ1T1V:
+ case X86::PT2RPNTLVWZ0RSV:
+ case X86::PT2RPNTLVWZ0RST1V:
+ case X86::PT2RPNTLVWZ1RSV:
+ case X86::PT2RPNTLVWZ1RST1V: {
for (unsigned i = 3; i > 0; --i)
MI.removeOperand(i);
unsigned Opc;
@@ -736,6 +748,18 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case X86::PT2RPNTLVWZ1T1V:
Opc = X86::T2RPNTLVWZ1T1;
break;
+ case X86::PT2RPNTLVWZ0RSV:
+ Opc = X86::T2RPNTLVWZ0RS;
+ break;
+ case X86::PT2RPNTLVWZ0RST1V:
+ Opc = X86::T2RPNTLVWZ0RST1;
+ break;
+ case X86::PT2RPNTLVWZ1RSV:
+ Opc = X86::T2RPNTLVWZ1RS;
+ break;
+ case X86::PT2RPNTLVWZ1RST1V:
+ Opc = X86::T2RPNTLVWZ1RST1;
+ break;
default:
llvm_unreachable("Impossible Opcode!");
}
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 8ad8641d1de485..72de0e0e8761f2 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -337,6 +337,10 @@ namespace {
case X86::PT2RPNTLVWZ0T1V:
case X86::PT2RPNTLVWZ1V:
case X86::PT2RPNTLVWZ1T1V:
+ case X86::PT2RPNTLVWZ0RSV:
+ case X86::PT2RPNTLVWZ0RST1V:
+ case X86::PT2RPNTLVWZ1RSV:
+ case X86::PT2RPNTLVWZ1RST1V:
return true;
}
for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
@@ -5157,6 +5161,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, Res);
return;
}
+ case Intrinsic::x86_tileloaddrs64_internal:
+ case Intrinsic::x86_tileloaddrst164_internal:
+ if (!Subtarget->hasAMXMOVRS())
+ break;
+ [[fallthrough]];
case Intrinsic::x86_tileloadd64_internal:
case Intrinsic::x86_tileloaddt164_internal: {
if (!Subtarget->hasAMXTILE())
@@ -5164,9 +5173,23 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
auto *MFI =
CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
- unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
- ? X86::PTILELOADDV
- : X86::PTILELOADDT1V;
+ unsigned Opc;
+ switch (IntNo) {
+ default:
+ llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_tileloaddrs64_internal:
+ Opc = X86::PTILELOADDRSV;
+ break;
+ case Intrinsic::x86_tileloaddrst164_internal:
+ Opc = X86::PTILELOADDRST1V;
+ break;
+ case Intrinsic::x86_tileloadd64_internal:
+ Opc = X86::PTILELOADDV;
+ break;
+ case Intrinsic::x86_tileloaddt164_internal:
+ Opc = X86::PTILELOADDT1V;
+ break;
+ }
// _tile_loadd_internal(row, col, buf, STRIDE)
SDValue Base = Node->getOperand(4);
SDValue Scale = getI8Imm(1, dl);
@@ -5270,6 +5293,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, CNode);
return;
}
+ case Intrinsic::x86_tileloaddrs64:
+ case Intrinsic::x86_tileloaddrst164:
+ if (!Subtarget->hasAMXMOVRS())
+ break;
+ [[fallthrough]];
case Intrinsic::x86_tileloadd64:
case Intrinsic::x86_tileloaddt164:
case Intrinsic::x86_tilestored64: {
@@ -5282,7 +5310,13 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
switch (IntNo) {
default: llvm_unreachable("Unexpected intrinsic!");
case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
+ case Intrinsic::x86_tileloaddrs64:
+ Opc = X86::PTILELOADDRS;
+ break;
case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
+ case Intrinsic::x86_tileloaddrst164:
+ Opc = X86::PTILELOADDRST1;
+ break;
case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
}
// FIXME: Match displacement and scale.
@@ -5305,6 +5339,13 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, CNode);
return;
}
+ case Intrinsic::x86_t2rpntlvwz0rs:
+ case Intrinsic::x86_t2rpntlvwz0rst1:
+ case Intrinsic::x86_t2rpntlvwz1rs:
+ case Intrinsic::x86_t2rpntlvwz1rst1:
+ if (!Subtarget->hasAMXMOVRS())
+ break;
+ [[fallthrough]];
case Intrinsic::x86_t2rpntlvwz0:
case Intrinsic::x86_t2rpntlvwz0t1:
case Intrinsic::x86_t2rpntlvwz1:
@@ -5330,6 +5371,18 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::x86_t2rpntlvwz1t1:
Opc = X86::PT2RPNTLVWZ1T1;
break;
+ case Intrinsic::x86_t2rpntlvwz0rs:
+ Opc = X86::PT2RPNTLVWZ0RS;
+ break;
+ case Intrinsic::x86_t2rpntlvwz0rst1:
+ Opc = X86::PT2RPNTLVWZ0RST1;
+ break;
+ case Intrinsic::x86_t2rpntlvwz1rs:
+ Opc = X86::PT2RPNTLVWZ1RS;
+ break;
+ case Intrinsic::x86_t2rpntlvwz1rst1:
+ Opc = X86::PT2RPNTLVWZ1RST1;
+ break;
}
// FIXME: Match displacement and scale.
unsigned TIndex = Node->getConstantOperandVal(2);
@@ -5340,9 +5393,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
SDValue Chain = Node->getOperand(0);
- MachineSDNode *CNode;
SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
- CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
ReplaceNode(Node, CNode);
return;
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c08efc9f7271e6..db04f3a48d4d03 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -27345,6 +27345,10 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
}
+ case Intrinsic::x86_t2rpntlvwz0rs_internal:
+ case Intrinsic::x86_t2rpntlvwz0rst1_internal:
+ case Intrinsic::x86_t2rpntlvwz1rs_internal:
+ case Intrinsic::x86_t2rpntlvwz1rst1_internal:
case Intrinsic::x86_t2rpntlvwz0_internal:
case Intrinsic::x86_t2rpntlvwz0t1_internal:
case Intrinsic::x86_t2rpntlvwz1_internal:
@@ -27368,6 +27372,18 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
case Intrinsic::x86_t2rpntlvwz1t1_internal:
Opc = X86::PT2RPNTLVWZ1T1V;
break;
+ case Intrinsic::x86_t2rpntlvwz0rs_internal:
+ Opc = X86::PT2RPNTLVWZ0RSV;
+ break;
+ case Intrinsic::x86_t2rpntlvwz0rst1_internal:
+ Opc = X86::PT2RPNTLVWZ0RST1V;
+ break;
+ case Intrinsic::x86_t2rpntlvwz1rs_internal:
+ Opc = X86::PT2RPNTLVWZ1RSV;
+ break;
+ case Intrinsic::x86_t2rpntlvwz1rst1_internal:
+ Opc = X86::PT2RPNTLVWZ1RST1V;
+ break;
}
SDLoc DL(Op);
@@ -37533,6 +37549,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
return BB;
}
+ case X86::PTILELOADDRS:
+ case X86::PTILELOADDRST1:
case X86::PTILELOADD:
case X86::PTILELOADDT1:
case X86::PTILESTORED: {
@@ -37550,6 +37568,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
break;
#undef GET_EGPR_IF_ENABLED
+ case X86::PTILELOADDRS:
+ Opc = X86::TILELOADDRS;
+ break;
+ case X86::PTILELOADDRST1:
+ Opc = X86::TILELOADDRST1;
+ break;
}
MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
@@ -37590,6 +37614,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
+ case X86::PT2RPNTLVWZ0RS:
+ case X86::PT2RPNTLVWZ0RST1:
+ case X86::PT2RPNTLVWZ1RS:
+ case X86::PT2RPNTLVWZ1RST1:
case X86::PT2RPNTLVWZ0:
case X86::PT2RPNTLVWZ0T1:
case X86::PT2RPNTLVWZ1:
@@ -37611,6 +37639,18 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::PT2RPNTLVWZ1T1:
Opc = X86::T2RPNTLVWZ1T1;
break;
+ case X86::PT2RPNTLVWZ0RS:
+ Opc = X86::T2RPNTLVWZ0RS;
+ break;
+ case X86::PT2RPNTLVWZ0RST1:
+ Opc = X86::T2RPNTLVWZ0RST1;
+ break;
+ case X86::PT2RPNTLVWZ1RS:
+ Opc = X86::T2RPNTLVWZ1RS;
+ break;
+ case X86::PT2RPNTLVWZ1RST1:
+ Opc = X86::T2RPNTLVWZ1RST1;
+ break;
}
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index 04527716e31627..059bfb4c70bcf9 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -370,6 +370,70 @@ let Predicates = [HasAMXTRANSPOSE, In64BitMode] in {
}
} // HasAMXTILE, HasAMXTRANSPOSE
+let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in {
+ def T2RPNTLVWZ0RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst),
+ (ins sibmem:$src1),
+ "t2rpntlvwz0rs\t{$src1, $dst|$dst, $src1}",
+ []>, VEX, T_MAP5;
+ def T2RPNTLVWZ0RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst),
+ (ins sibmem:$src1),
+ "t2rpntlvwz0rst1\t{$src1, $dst|$dst, $src1}",
+ []>, VEX, T_MAP5;
+ def T2RPNTLVWZ1RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst),
+ (ins sibmem:$src1),
+ "t2rpntlvwz1rs\t{$src1, $dst|$dst, $src1}",
+ []>, VEX, T_MAP5, PD;
+ def T2RPNTLVWZ1RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst),
+ (ins sibmem:$src1),
+ "t2rpntlvwz1rst1\t{$src1, $dst|$dst, $src1}",
+ []>, VEX, T_MAP5, PD;
+ let isPseudo = true in {
+ def PT2RPNTLVWZ0RSV : PseudoI<(outs TILEPair:$dst),
+ (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
+ []>;
+ def PT2RPNTLVWZ0RST1V : PseudoI<(outs TILEPair:$dst),
+ (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
+ []>;
+ def PT2RPNTLVWZ1RSV : PseudoI<(outs TILEPair:$dst),
+ (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
+ []>;
+ def PT2RPNTLVWZ1RST1V : PseudoI<(outs TILEPair:$dst),
+ (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
+ []>;
+ }
+ let usesCustomInserter = 1 in {
+ def PT2RPNTLVWZ0RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
+ def PT2RPNTLVWZ0RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
+ def PT2RPNTLVWZ1RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
+ def PT2RPNTLVWZ1RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
+ }
+} // HasAMXMOVRS, HasAMXTRANSPOSE
+
+let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in {
+ def TILELOADDRS : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst),
+ (ins sibmem:$src1),
+ "tileloaddrs\t{$src1, $dst|$dst, $src1}",
+ []>, VEX, T8, XD;
+ def TILELOADDRST1 : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst),
+ (ins sibmem:$src1),
+ "tileloaddrst1\t{$src1, $dst|$dst, $src1}",
+ []>, VEX, T8, PD;
+
+ let isPseudo = true, mayLoad = 1 in {
+ def PTILELOADDRSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+ GR16:$src2,
+ opaquemem:$src3), []>;
+ def PTILELOADDRST1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+ GR16:$src2,
+ opaquemem:$src3), []>;
+ }
+
+ let usesCustomInserter = 1, mayLoad = 1 in {
+ def PTILELOADDRS : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>;
+ def PTILELOADDRST1 : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>;
+ }
+} // HasAMXMOVRS, In64BitMode
+
multiclass m_tcvtrowd2ps {
let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
let SchedRW = [WriteSystem] in {
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index a9ec5f660ff1d8..eb2e93a94b197c 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -184,6 +184,7 @@ def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">;
def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">;
def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">;
def HasAMXFP8 : Predicate<"Subtarget->hasAMXFP8()">;
+def HasAMXMOVRS : Predicate<"Subtarget->hasAMXMOVRS()">;
def HasAMXTRANSPOSE : Predicate<"Subtarget->hasAMXTRANSPOSE()">;
def HasAMXAVX512 : Predicate<"Subtarget->hasAMXAVX512()">;
def HasAMXTF32 : Predicate<"Subtarget->hasAMXTF32()">;
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 0e74cfa75e9606..3dc69d4ecb0201 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -227,7 +227,13 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II,
case Intrinsic::x86_t2rpntlvwz1t1_internal:
case Intrinsic::x86_tileloadd64_internal:
case Intrinsic::x86_tileloaddt164_internal:
- case Intrinsic::x86_tilestored64_internal: {
+ case Intrinsic::x86_tilestored64_internal:
+ case Intrinsic::x86_t2rpntlvwz0rs_internal:
+ case Intrinsic::x86_t2rpntlvwz0rst1_internal:
+ case Intrinsic::x86_t2rpntlvwz1rs_internal:
+ case Intrinsic::x86_t2rpntlvwz1rst1_internal:
+ case Intrinsic::x86_tileloaddrs64_internal:
+ case Intrinsic::x86_tileloaddrst164_internal: {
Row = II->getArgOperand(0);
Col = II->getArgOperand(1);
break;
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 09418c9bb74d34..08f6f43c72638b 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -1077,6 +1077,8 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
case X86::PTCMMIMFP16PSV:
case X86::PTCMMRLFP16PSV:
case X86::PTTRANSPOSEDV:
+ case X86::PTILELOADDRSV:
+ case X86::PTILELOADDRST1V:
case X86::PTMMULTF32PSV:
case X86::PTTMMULTF32PSV: {
MachineOperand &MO1 = MI->getOperand(1);
@@ -1088,7 +1090,11 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
case X86::PT2RPNTLVWZ0V:
case X86::PT2RPNTLVWZ0T1V:
case X86::PT2RPNTLVWZ1V:
- case X86::PT2RPNTLVWZ1T1V: {
+ case X86::PT2RPNTLVWZ1T1V:
+ case X86::PT2RPNTLVWZ0RSV:
+ case X86::PT2RPNTLVWZ0RST1V:
+ case X86::PT2RPNTLVWZ1RSV:
+ case X86::PT2RPNTLVWZ1RST1V: {
MachineOperand &MO1 = MI->getOperand(1);
MachineOperand &MO2 = MI->getOperand(2);
MachineOperand &MO3 = MI->getOperand(3);
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 140e565e1686f2..58ba2553633221 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1882,6 +1882,7 @@ const StringMap<bool> sys::getHostCPUFeatures() {
Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave;
Features["amx-tf32"] = HasLeaf1E && ((EAX >> 6) & 1) && HasAMXSave;
Features["amx-avx512"] = HasLeaf1E && ((EAX >> 7) & 1) && HasAMXSave;
+ Features["amx-movrs"] = HasLeaf1E && ((EAX >> 8) & 1) && HasAMXSave;
bool HasLeaf24 =
MaxLevel >= 0x24 && !getX86CpuIDAndInfo(0x24, &EAX, &EBX, &ECX, &EDX);
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index 6b53424833bd47..0da740743c9b7c 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -600,6 +600,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE;
+constexpr FeatureBitset ImpliedFeaturesAMX_MOVRS = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_AVX512 =
FeatureAMX_TILE | FeatureAVX10_2_512;
constexpr FeatureBitset ImpliedFeaturesAMX_TF32 = FeatureAMX_TILE;
diff --git a/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll
new file mode 100755
index 00000000000000..da212a1850964e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-movrs | FileCheck %s
+
+define void @test_amx_internal(i16 %m, i16 %n, ptr %buf, i64 %s) {
+; CHECK-LABEL: test_amx_internal:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
+; CHECK-NEXT: subq $3072, %rsp # imm = 0xC00
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; CHECK-NEXT: movw %ax, %cx
+; CHECK-NEXT: movw %di, %ax
+; CHECK-NEXT: # implicit-def: $al
+; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT: tileloaddrs (%rdx,%rsi), %tmm0
+; CHECK-NEXT: movl $64, %esi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-NEXT: tilerelease
+; CHECK-NEXT: retq
+entry:
+ %t1 = call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %m, i16 %n, ptr %buf, i64 %s)
+ %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
+ ret void
+}
+declare x86_amx @llvm.x86.tileloaddrs64.internal(i16, i16, ptr, i64)
+
+define void @test_amx_old(i16 %m, i16 %n, ptr %buf) {
+; CHECK-LABEL: test_amx_old:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl $32, %eax
+; CHECK-NEXT: tileloaddrs (%rdx,%rax), %tmm2
+; CHECK-NEXT: retq
+entry:
+ call void @llvm.x86.tileloaddrs64(i8 2, ptr %buf, i64 32)
+ ret void
+}
+declare void @llvm.x86.tileloaddrs64(i8 immarg, ptr, i64)
+
+define void @test_amx_t1_internal(i16 %m, i16 %n, ptr %buf, i64 %s) {
+; CHECK-LABEL: test_amx_t1_internal:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
+; CHECK-NEXT: subq $3072, %rsp # imm = 0xC00
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; CHECK-NEXT: movw %ax, %cx
+; CHECK-NEXT: movw %di, %ax
+; CHECK-NEXT: # implicit-def: $al
+; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT: tileloaddrst1 (%rdx,%rsi), %tmm0
+; CHECK-NEXT: movl $64, %esi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-NEXT: tilerelease
+; CHECK-NEXT: retq
+entry:
+ %t1 = call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %m, i16 %n, ptr %buf, i64 %s)
+ %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
+ ret void
+}
+declare x86_amx @llvm.x86.tileloaddrst164.internal(i16, i16, ptr, i64)
+
+define void @test_amx_t1_old(i16 %m, i16 %n, ptr %buf) {
+; CHECK-LABEL: test_amx_t1_old:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl $32, %eax
+; CHECK-NEXT: tileloaddrst1 (%rdx,%rax), %tmm2
+; CHECK-NEXT: retq
+entry:
+ call void @llvm.x86.tileloaddrst164(i8 2, ptr %buf, i64 32)
+ ret void
+}
+declare void @llvm.x86.tileloaddrst164(i8 immarg, ptr, i64)
diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll
new file mode 100755
index 00000000000000..146b69773eb186
--- /dev/null
+++ b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0
+; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2
+
+define void @test_amx(i64 %stride, i8* %addr1) #0 {
+; CHECK-LABEL: test_amx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0
+; CHECK-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2
+; CHECK-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0
+; CHECK-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2
+; CHECK-NEXT: retq
+ call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride)
+ call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride)
+ call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride)
+ call void @llvm.x86.t2rpntlvwz1rst1(i8 2, i8* %addr1, i64 %stride)
+ ret void
+}
+declare void @llvm.x86.t2rpntlvwz0rs(i8 , i8* , i64 )
+declare void @llvm.x86.t2rpntlvwz0rst1(i8 , i8* , i64 )
+declare void @llvm.x86.t2rpntlvwz1rs(i8 , i8* , i64 )
+declare void @llvm.x86.t2rpntlvwz1rst1(i8 , i8* , i64 )
+
+define void @test_amx2(i8* %base, i64 %stride) #0 {
+; O0-LABEL: test_amx2:
+; O0: # %bb.0:
+; O0-NEXT: xorps %xmm0, %xmm0
+; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movb $1, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movw $8, %ax
+; O0-NEXT: # implicit-def: $al
+; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; O0-NEXT: # implicit-def: $al
+; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
+; O0-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4
+; O0-NEXT: movw $8, %ax
+; O0-NEXT: # implicit-def: $al
+; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; O0-NEXT: # implicit-def: $al
+; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
+; O0-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4
+; O0-NEXT: movw $8, %ax
+; O0-NEXT: # implicit-def: $al
+; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; O0-NEXT: # implicit-def: $al
+; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
+; O0-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4
+; O0-NEXT: tilerelease
+; O0-NEXT: retq
+;
+; O2-LABEL: test_amx2:
+; O2: # %bb.0:
+; O2-NEXT: xorps %xmm0, %xmm0
+; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; O2-NEXT: movb $1, -{{[0-9]+}}(%rsp)
+; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp)
+; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp)
+; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp)
+; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp)
+; O2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
+; O2-NEXT: movw $8, %ax
+; O2-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4
+; O2-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4
+; O2-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4
+; O2-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4
+; O2-NEXT: tilerelease
+; O2-NEXT: retq
+ call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
+ call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
+ call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
+ call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
+ ret void
+}
+declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16, i16, i16, i8*, i64)
+declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16, i16, i16, i8*, i64)
+declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16, i16, i16, i8*, i64)
+declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16, i16, i16, i8*, i64)
diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt
new file mode 100755
index 00000000000000..6df44c87d2332f
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt
@@ -0,0 +1,98 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s -check-prefix=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s -check-prefix=INTEL
+
+# ATT: t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6
+# INTEL: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456]
+0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2
+# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291]
+0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: t2rpntlvwz0rs 64(%rbx), %tmm6
+# INTEL: t2rpntlvwz0rs tmm6, [rbx + 64]
+0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40
+
+# ATT: t2rpntlvwz0rs -32(,%rbp,2), %tmm2
+# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32]
+0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT: t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6
+# INTEL: t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456]
+0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2
+# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291]
+0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: t2rpntlvwz0rst1 64(%rbx), %tmm6
+# INTEL: t2rpntlvwz0rst1 tmm6, [rbx + 64]
+0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40
+
+# ATT: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2
+# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
+0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT: t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6
+# INTEL: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456]
+0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2
+# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291]
+0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: t2rpntlvwz1rs 64(%rbx), %tmm6
+# INTEL: t2rpntlvwz1rs tmm6, [rbx + 64]
+0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40
+
+# ATT: t2rpntlvwz1rs -32(,%rbp,2), %tmm2
+# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32]
+0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT: t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6
+# INTEL: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456]
+0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2
+# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291]
+0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: t2rpntlvwz1rst1 64(%rbx), %tmm6
+# INTEL: t2rpntlvwz1rst1 tmm6, [rbx + 64]
+0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40
+
+# ATT: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2
+# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
+0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT: tileloaddrs 268435456(%rbp,%r14,8), %tmm6
+# INTEL: tileloaddrs tmm6, [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: tileloaddrs 291(%r8,%rax,4), %tmm3
+# INTEL: tileloaddrs tmm3, [r8 + 4*rax + 291]
+0xc4,0xc2,0x7b,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: tileloaddrs 64(%rbx), %tmm6
+# INTEL: tileloaddrs tmm6, [rbx + 64]
+0xc4,0xe2,0x7b,0x4a,0x74,0x23,0x40
+
+# ATT: tileloaddrs -32(,%rbp,2), %tmm3
+# INTEL: tileloaddrs tmm3, [2*rbp - 32]
+0xc4,0xe2,0x7b,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT: tileloaddrst1 268435456(%rbp,%r14,8), %tmm6
+# INTEL: tileloaddrst1 tmm6, [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x79,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: tileloaddrst1 291(%r8,%rax,4), %tmm3
+# INTEL: tileloaddrst1 tmm3, [r8 + 4*rax + 291]
+0xc4,0xc2,0x79,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: tileloaddrst1 64(%rbx), %tmm6
+# INTEL: tileloaddrst1 tmm6, [rbx + 64]
+0xc4,0xe2,0x79,0x4a,0x74,0x23,0x40
+
+# ATT: tileloaddrst1 -32(,%rbp,2), %tmm3
+# INTEL: tileloaddrst1 tmm3, [2*rbp - 32]
+0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff
diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s
new file mode 100755
index 00000000000000..d780ad4f0e3691
--- /dev/null
+++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s
@@ -0,0 +1,89 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK: t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6
+// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6
+
+// CHECK: t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2
+// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2
+
+// CHECK: t2rpntlvwz0rs 64(%rbx), %tmm6
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40]
+ t2rpntlvwz0rs 64(%rbx), %tmm6
+
+// CHECK: t2rpntlvwz0rs -32(,%rbp,2), %tmm2
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz0rs -32(,%rbp,2), %tmm2
+
+// CHECK: t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6
+// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6
+
+// CHECK: t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2
+// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2
+
+// CHECK: t2rpntlvwz0rst1 64(%rbx), %tmm6
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40]
+ t2rpntlvwz0rst1 64(%rbx), %tmm6
+
+// CHECK: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2
+
+// CHECK: t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6
+// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6
+
+// CHECK: t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2
+// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2
+
+// CHECK: t2rpntlvwz1rs 64(%rbx), %tmm6
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40]
+ t2rpntlvwz1rs 64(%rbx), %tmm6
+
+// CHECK: t2rpntlvwz1rs -32(,%rbp,2), %tmm2
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz1rs -32(,%rbp,2), %tmm2
+
+// CHECK: t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6
+// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6
+
+// CHECK: t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2
+// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2
+
+// CHECK: t2rpntlvwz1rst1 64(%rbx), %tmm6
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40]
+ t2rpntlvwz1rst1 64(%rbx), %tmm6
+
+// CHECK: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2
+
+// CHECK: tileloaddrs 268435456(%rbp,%r14,8), %tmm6
+// CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ tileloaddrs 268435456(%rbp,%r14,8), %tmm6
+
+// CHECK: tileloaddrs 291(%r8,%rax,4), %tmm3
+// CHECK: encoding: [0xc4,0xc2,0x7b,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ tileloaddrs 291(%r8,%rax,4), %tmm3
+
+// CHECK: tileloaddrs -32(,%rbp,2), %tmm3
+// CHECK: encoding: [0xc4,0xe2,0x7b,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff]
+ tileloaddrs -32(,%rbp,2), %tmm3
+
+// CHECK: tileloaddrst1 268435456(%rbp,%r14,8), %tmm6
+// CHECK: encoding: [0xc4,0xa2,0x79,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ tileloaddrst1 268435456(%rbp,%r14,8), %tmm6
+
+// CHECK: tileloaddrst1 291(%r8,%rax,4), %tmm3
+// CHECK: encoding: [0xc4,0xc2,0x79,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ tileloaddrst1 291(%r8,%rax,4), %tmm3
+
+// CHECK: tileloaddrst1 -32(,%rbp,2), %tmm3
+// CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff]
+ tileloaddrst1 -32(,%rbp,2), %tmm3
\ No newline at end of file
diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s
new file mode 100755
index 00000000000000..ccc7ac51a98a44
--- /dev/null
+++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s
@@ -0,0 +1,97 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291]
+
+// CHECK: t2rpntlvwz0rs tmm6, [rbx + 64]
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40]
+ t2rpntlvwz0rs tmm6, [rbx + 64]
+
+// CHECK: t2rpntlvwz0rs tmm2, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz0rs tmm2, [2*rbp - 32]
+
+// CHECK: t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291]
+
+// CHECK: t2rpntlvwz0rst1 tmm6, [rbx + 64]
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40]
+ t2rpntlvwz0rst1 tmm6, [rbx + 64]
+
+// CHECK: t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
+
+// CHECK: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291]
+
+// CHECK: t2rpntlvwz1rs tmm6, [rbx + 64]
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40]
+ t2rpntlvwz1rs tmm6, [rbx + 64]
+
+// CHECK: t2rpntlvwz1rs tmm2, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz1rs tmm2, [2*rbp - 32]
+
+// CHECK: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291]
+
+// CHECK: t2rpntlvwz1rst1 tmm6, [rbx + 64]
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40]
+ t2rpntlvwz1rst1 tmm6, [rbx + 64]
+
+// CHECK: t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
+
+// CHECK: tileloaddrs tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ tileloaddrs tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: tileloaddrs tmm3, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x7b,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ tileloaddrs tmm3, [r8 + 4*rax + 291]
+
+// CHECK: tileloaddrs tmm6, [rbx + 64]
+// CHECK: encoding: [0xc4,0xe2,0x7b,0x4a,0x74,0x23,0x40]
+ tileloaddrs tmm6, [rbx + 64]
+
+// CHECK: tileloaddrs tmm3, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe2,0x7b,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff]
+ tileloaddrs tmm3, [2*rbp - 32]
+
+// CHECK: tileloaddrst1 tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x79,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ tileloaddrst1 tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: tileloaddrst1 tmm3, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x79,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ tileloaddrst1 tmm3, [r8 + 4*rax + 291]
+
+// CHECK: tileloaddrst1 tmm6, [rbx + 64]
+// CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x74,0x23,0x40]
+ tileloaddrst1 tmm6, [rbx + 64]
+
+// CHECK: tileloaddrst1 tmm3, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff]
+ tileloaddrst1 tmm3, [2*rbp - 32]
More information about the llvm-commits
mailing list