[clang] [llvm] [X86][AMX] Support AMX-MOVRS (PR #115151)
Malay Sanghi via cfe-commits
cfe-commits at lists.llvm.org
Mon Nov 11 20:47:16 PST 2024
https://github.com/MalaySanghi updated https://github.com/llvm/llvm-project/pull/115151
>From 3b6510da8fb3b9709839ea0c102355879b11aa6d Mon Sep 17 00:00:00 2001
From: Malay Sanghi <malay.sanghi at intel.com>
Date: Tue, 5 Nov 2024 13:37:54 +0800
Subject: [PATCH 1/4] [X86][AMX] Support AMX-MOVRS
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368
---
clang/include/clang/Basic/BuiltinsX86_64.def | 14 ++
clang/include/clang/Driver/Options.td | 2 +
clang/lib/Basic/Targets/X86.cpp | 6 +
clang/lib/Basic/Targets/X86.h | 1 +
clang/lib/CodeGen/CGBuiltin.cpp | 18 +-
clang/lib/Headers/CMakeLists.txt | 1 +
clang/lib/Headers/amxmovrsintrin.h | 48 +++++
clang/lib/Headers/amxtransposeintrin.h | 177 ++++++++++++++++++
clang/lib/Headers/immintrin.h | 4 +
clang/lib/Sema/SemaX86.cpp | 6 +
clang/test/CodeGen/X86/amx_movrs.c | 25 +++
clang/test/CodeGen/X86/amx_movrs_api.c | 34 ++++
clang/test/CodeGen/X86/amx_movrs_errors.c | 14 ++
clang/test/CodeGen/X86/amx_movrs_tranpose.c | 53 ++++++
.../test/CodeGen/X86/amx_movrs_tranpose_api.c | 81 ++++++++
.../CodeGen/X86/amx_movrs_transpose_errors.c | 22 +++
llvm/include/llvm/IR/IntrinsicsX86.td | 48 +++++
llvm/lib/Target/X86/X86.td | 3 +
llvm/lib/Target/X86/X86ExpandPseudo.cpp | 35 ++++
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 109 ++++++++++-
llvm/lib/Target/X86/X86ISelLowering.cpp | 81 ++++++++
llvm/lib/Target/X86/X86InstrAMX.td | 91 +++++++++
llvm/lib/Target/X86/X86InstrInfo.cpp | 1 +
llvm/lib/Target/X86/X86InstrPredicates.td | 1 +
llvm/lib/Target/X86/X86LowerAMXType.cpp | 8 +-
llvm/lib/Target/X86/X86RegisterInfo.cpp | 10 +-
llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll | 108 +++++++++++
.../X86/amx_movrs_transpose_intrinsics.ll | 92 +++++++++
.../Disassembler/X86/AMX/x86-64-amx-movrs.txt | 98 ++++++++++
llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s | 89 +++++++++
llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s | 97 ++++++++++
31 files changed, 1371 insertions(+), 6 deletions(-)
create mode 100644 clang/lib/Headers/amxmovrsintrin.h
create mode 100755 clang/test/CodeGen/X86/amx_movrs.c
create mode 100755 clang/test/CodeGen/X86/amx_movrs_api.c
create mode 100755 clang/test/CodeGen/X86/amx_movrs_errors.c
create mode 100755 clang/test/CodeGen/X86/amx_movrs_tranpose.c
create mode 100755 clang/test/CodeGen/X86/amx_movrs_tranpose_api.c
create mode 100755 clang/test/CodeGen/X86/amx_movrs_transpose_errors.c
create mode 100755 llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll
create mode 100755 llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll
create mode 100755 llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt
create mode 100755 llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s
create mode 100755 llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index d95e8455a304b6..98235023bddc7b 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -117,7 +117,9 @@ TARGET_BUILTIN(__builtin_ia32_uwrmsr, "vULLiULLi", "n", "usermsr")
// AMX internal builtin
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tileloaddrs64_internal, "V256iUsUsvC*z", "n", "amx-movrs")
TARGET_BUILTIN(__builtin_ia32_tileloaddt164_internal, "V256iUsUsvC*z", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tileloaddrst164_internal, "V256iUsUsvC*z", "n", "amx-movrs")
TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
@@ -129,15 +131,27 @@ TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i",
TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose")
+
// AMX
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tilerelease, "v", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tilezero, "vUc", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
+
+TARGET_BUILTIN(__builtin_ia32_tileloaddrs64, "vIUcvC*z", "n", "amx-movrs")
+TARGET_BUILTIN(__builtin_ia32_tileloaddrst164, "vIUcvC*z", "n", "amx-movrs")
TARGET_BUILTIN(__builtin_ia32_tileloadd64, "vIUcvC*z", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tileloaddt164, "vIUcvC*z", "n", "amx-tile")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 805b79491e6ea4..c36adb673dd4ea 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6303,6 +6303,8 @@ def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>;
def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>;
+def mamx_movrs: Flag<["-"], "mamx-movrs">, Group<m_x86_Features_Group>;
+def mno_amx_movrs: Flag<["-"], "mno-amx-movrs">, Group<m_x86_Features_Group>;
def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>;
def mno_cmpccxadd : Flag<["-"], "mno-cmpccxadd">, Group<m_x86_Features_Group>;
def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index d7d3adef42c79a..0ddc1ac4c47f22 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -430,6 +430,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAMXCOMPLEX = true;
} else if (Feature == "+amx-fp8") {
HasAMXFP8 = true;
+ } else if (Feature == "+amx-movrs") {
+ HasAMXMOVRS = true;
} else if (Feature == "+amx-transpose") {
HasAMXTRANSPOSE = true;
} else if (Feature == "+cmpccxadd") {
@@ -953,6 +955,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AMX_COMPLEX__");
if (HasAMXFP8)
Builder.defineMacro("__AMX_FP8__");
+ if (HasAMXMOVRS)
+ Builder.defineMacro("__AMX_MOVRS__");
if (HasAMXTRANSPOSE)
Builder.defineMacro("__AMX_TRANSPOSE__");
if (HasCMPCCXADD)
@@ -1085,6 +1089,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("amx-fp16", true)
.Case("amx-fp8", true)
.Case("amx-int8", true)
+ .Case("amx-movrs", true)
.Case("amx-tile", true)
.Case("amx-transpose", true)
.Case("avx", true)
@@ -1205,6 +1210,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("amx-fp16", HasAMXFP16)
.Case("amx-fp8", HasAMXFP8)
.Case("amx-int8", HasAMXINT8)
+ .Case("amx-movrs", HasAMXMOVRS)
.Case("amx-tile", HasAMXTILE)
.Case("amx-transpose", HasAMXTRANSPOSE)
.Case("avx", SSELevel >= AVX)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index e2eba63b992355..54a078d2f137b6 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -158,6 +158,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasAMXBF16 = false;
bool HasAMXCOMPLEX = false;
bool HasAMXFP8 = false;
+ bool HasAMXMOVRS = false;
bool HasAMXTRANSPOSE = false;
bool HasSERIALIZE = false;
bool HasTSXLDTRK = false;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 34fedd67114751..02ee0132bbb5eb 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16996,9 +16996,13 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
}
// Corresponding to intrisics which will return 2 tiles (tile0_tile1).
case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
+ case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
+ case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
- case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: {
+ case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
+ case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
+ case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: {
Intrinsic::ID IID;
switch (BuiltinID) {
default:
@@ -17006,15 +17010,27 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
IID = Intrinsic::x86_t2rpntlvwz0_internal;
break;
+ case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
+ IID = Intrinsic::x86_t2rpntlvwz0rs_internal;
+ break;
case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
break;
+ case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
+ IID = Intrinsic::x86_t2rpntlvwz0rst1_internal;
+ break;
case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
IID = Intrinsic::x86_t2rpntlvwz1_internal;
break;
+ case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
+ IID = Intrinsic::x86_t2rpntlvwz1rs_internal;
+ break;
case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
break;
+ case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal:
+ IID = Intrinsic::x86_t2rpntlvwz1rst1_internal;
+ break;
}
// Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 67242cd4d981bc..a50cf01eac6fef 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -151,6 +151,7 @@ set(x86_files
amxfp8intrin.h
amxintrin.h
amxtransposeintrin.h
+ amxmovrsintrin.h
avx10_2_512bf16intrin.h
avx10_2_512convertintrin.h
avx10_2_512minmaxintrin.h
diff --git a/clang/lib/Headers/amxmovrsintrin.h b/clang/lib/Headers/amxmovrsintrin.h
new file mode 100644
index 00000000000000..5fe2fdecb8b5dd
--- /dev/null
+++ b/clang/lib/Headers/amxmovrsintrin.h
@@ -0,0 +1,48 @@
+/*===-------- amxmovrsintrin.h - AMX MOVRS intrinsics -*- C++ -*---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * ===-------------------------------------------------------------------=== */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxmovrsintrin.h> directly; include <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMXMOVRSINTRIN_H
+#define __AMXMOVRSINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_MOVRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-movrs")))
+
+#define _tile_loaddrs(dst, base, stride) \
+ __builtin_ia32_tileloaddrs64((dst), ((const void *)(base)), \
+ (__SIZE_TYPE__)(stride))
+#define _tile_stream_loaddrs(dst, base, stride) \
+ __builtin_ia32_tileloaddrst164((dst), ((const void *)(base)), \
+ (__SIZE_TYPE__)(stride))
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS
+_tile_loaddrs_internal(unsigned short m, unsigned short n, const void *base,
+ __SIZE_TYPE__ stride) {
+ return __builtin_ia32_tileloaddrs64_internal(m, n, base,
+ (__SIZE_TYPE__)(stride));
+}
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS
+_tile_loaddrst1_internal(unsigned short m, unsigned short n, const void *base,
+ __SIZE_TYPE__ stride) {
+ return __builtin_ia32_tileloaddrst164_internal(m, n, base,
+ (__SIZE_TYPE__)(stride));
+}
+static __inline__ void __DEFAULT_FN_ATTRS_MOVRS
+__tile_loaddrs(__tile1024i *dst, const void *base, __SIZE_TYPE__ stride) {
+ dst->tile = _tile_loaddrs_internal(dst->row, dst->col, base, stride);
+}
+static __inline__ void __DEFAULT_FN_ATTRS_MOVRS __tile_stream_loaddrs(
+ __tile1024i *dst, const void *base, __SIZE_TYPE__ stride) {
+ dst->tile = _tile_loaddrst1_internal(dst->row, dst->col, base, stride);
+}
+#undef __DEFAULT_FN_ATTRS_MOVRS
+#endif /* __x86_64__ */
+#endif /* __AMXMOVRSINTRIN_H */
diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h
index b3fa37d766c45b..086c9a75222ca1 100644
--- a/clang/lib/Headers/amxtransposeintrin.h
+++ b/clang/lib/Headers/amxtransposeintrin.h
@@ -17,6 +17,9 @@
#define __DEFAULT_FN_ATTRS_TRANSPOSE \
__attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))
+#define __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("amx-transpose,amx-movrs")))
#define _tile_2rpntlvwz0(tdst, base, stride) \
__builtin_ia32_t2rpntlvwz0(tdst, base, stride)
@@ -26,6 +29,15 @@
__builtin_ia32_t2rpntlvwz1(tdst, base, stride)
#define _tile_2rpntlvwz1t1(tdst, base, stride) \
__builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)
+// MOVRS versions
+#define _tile_2rpntlvwz0rs(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride)
+#define _tile_2rpntlvwz0rst1(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride)
+#define _tile_2rpntlvwz1rs(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride)
+#define _tile_2rpntlvwz1rst1(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride)
/// Transpose 32-bit elements from \a src and write the result to \a dst.
///
@@ -101,6 +113,45 @@ _tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) {
return __builtin_ia32_ttransposed_internal(m, n, src);
}
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
+_tile_2rpntlvwz0rs_internal(unsigned short row, unsigned short col0,
+ unsigned short col1, _tile1024i *dst0,
+ _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ // Use __tile1024i_1024a* to escape the alignment check in
+ // clang/test/Headers/x86-intrinsics-headers-clean.cpp
+ __builtin_ia32_t2rpntlvwz0rs_internal(
+ row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
+_tile_2rpntlvwz0rst1_internal(unsigned short row, unsigned short col0,
+ unsigned short col1, _tile1024i *dst0,
+ _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ __builtin_ia32_t2rpntlvwz0rst1_internal(
+ row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
+_tile_2rpntlvwz1rs_internal(unsigned short row, unsigned short col0,
+ unsigned short col1, _tile1024i *dst0,
+ _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ __builtin_ia32_t2rpntlvwz1rs_internal(
+ row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
+_tile_2rpntlvwz1rst1_internal(unsigned short row, unsigned short col0,
+ unsigned short col1, _tile1024i *dst0,
+ _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ __builtin_ia32_t2rpntlvwz1rst1_internal(
+ row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
@@ -229,6 +280,131 @@ static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
&dst1->tile, base, stride);
}
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+/// Provides a hint to the implementation that the data will likely become
+/// read shared in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
+static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1RS </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
+static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written. The last row will be not be read from memory but instead
+/// filled with zeros.
+/// Provides a hint to the implementation that the data will likely become
+/// read shared in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
+static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written. The last row will be not be read from memory but instead
+/// filled with zeros.
+/// Provides a hint to the implementation that the data will likely become
+/// read shared in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1RS </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
+static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
/// Transpose 32-bit elements from src and write the result to dst.
///
/// \headerfile <immintrin.h>
@@ -244,5 +420,6 @@ static void __tile_transposed(__tile1024i *dst, __tile1024i src) {
dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile);
}
+#undef __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
#endif /* __x86_64__ */
#endif /* __AMX_TRANSPOSEINTRIN_H */
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 4bf7eac4195eec..5035f02d889e72 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -656,6 +656,10 @@ _storebe_i64(void * __P, long long __D) {
#include <amxtransposeintrin.h>
#endif
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_MOVRS__)
+#include <amxmovrsintrin.h>
+#endif
+
#if !defined(__SCE__) || __has_feature(modules) || \
defined(__AVX512VP2INTERSECT__)
#include <avx512vp2intersectintrin.h>
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index ef878d16d445fd..4d3b0292a56a98 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -629,12 +629,18 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
return false;
case X86::BI__builtin_ia32_tileloadd64:
case X86::BI__builtin_ia32_tileloaddt164:
+ case X86::BI__builtin_ia32_tileloaddrs64:
+ case X86::BI__builtin_ia32_tileloaddrst164:
case X86::BI__builtin_ia32_tilestored64:
case X86::BI__builtin_ia32_tilezero:
case X86::BI__builtin_ia32_t2rpntlvwz0:
case X86::BI__builtin_ia32_t2rpntlvwz0t1:
case X86::BI__builtin_ia32_t2rpntlvwz1:
case X86::BI__builtin_ia32_t2rpntlvwz1t1:
+ case X86::BI__builtin_ia32_t2rpntlvwz0rst1:
+ case X86::BI__builtin_ia32_t2rpntlvwz1rs:
+ case X86::BI__builtin_ia32_t2rpntlvwz1rst1:
+ case X86::BI__builtin_ia32_t2rpntlvwz0rs:
return CheckBuiltinTileArgumentsRange(TheCall, 0);
case X86::BI__builtin_ia32_tdpbssd:
case X86::BI__builtin_ia32_tdpbsud:
diff --git a/clang/test/CodeGen/X86/amx_movrs.c b/clang/test/CodeGen/X86/amx_movrs.c
new file mode 100755
index 00000000000000..4a8f001baafcea
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_movrs.c
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-movrs -target-feature +avx512f \
+// RUN: -emit-llvm -o - -Wall -Werror -pedantic \
+// RUN: -Wno-gnu-statement-expression| FileCheck %s
+
+#include <immintrin.h>
+#include <stddef.h>
+
+#define STRIDE 32
+
+char buf[1024];
+
+void test_tile_loadd(short row, short col) {
+ // CHECK-LABEL: define dso_local void @test_tile_loadd(
+ // CHECK: call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+ // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+ _tile_loaddrs_internal(row, col, buf, STRIDE);
+}
+
+void test_tile_loaddt1(short row, short col) {
+ // CHECK-LABEL: define dso_local void @test_tile_loaddt1(
+ // CHECK: call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+ // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+ _tile_loaddrst1_internal(row, col, buf, STRIDE);
+}
diff --git a/clang/test/CodeGen/X86/amx_movrs_api.c b/clang/test/CodeGen/X86/amx_movrs_api.c
new file mode 100755
index 00000000000000..cf430adf140852
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_movrs_api.c
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-movrs -emit-llvm -o - -Wall -Werror -pedantic \
+// RUN: -Wno-gnu-statement-expression| FileCheck %s
+
+#include <immintrin.h>
+#include <stddef.h>
+
+#define STRIDE 32
+
+char buf[1024];
+
+void test_tile_loadd(short row) {
+ // CHECK-LABEL: define dso_local void @test_tile_loadd(
+ // CHECK: call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+ // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+ __tile1024i a = {row, 8};
+ __tile_loaddrs(&a, buf, STRIDE);
+}
+
+void test_tile_loaddt1(short row) {
+ // CHECK-LABEL: define dso_local void @test_tile_loaddt1(
+ // CHECK: call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+ // CHECK-NEXT: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+ __tile1024i a = {row, 8};
+ __tile_stream_loaddrs(&a, buf, STRIDE);
+}
+
+void test_tile_loadd_macro(void *data) {
+ // CHECK-LABEL: define dso_local void @test_tile_loadd_macro(
+ // CHECK: call void @llvm.x86.tileloaddrs64(i8 {{.*}}, ptr %{{.*}}, i64 {{.*}})
+ // CHECK: call void @llvm.x86.tileloaddrst164(i8 {{.*}}, ptr %{{.*}}, i64 {{.*}})
+ _tile_loaddrs(4, data, STRIDE);
+ _tile_stream_loaddrs(2, data, STRIDE);
+}
diff --git a/clang/test/CodeGen/X86/amx_movrs_errors.c b/clang/test/CodeGen/X86/amx_movrs_errors.c
new file mode 100755
index 00000000000000..bac7d962f5cb5c
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_movrs_errors.c
@@ -0,0 +1,14 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-reduce -target-feature +amx-memory \
+// RUN: -target-feature +amx-format -target-feature +amx-element -emit-llvm -verify
+
+#include <immintrin.h>
+#include <stddef.h>
+
+char buf[1024];
+
+void test_tile_load() {
+ _tile_loaddrs(20, buf, 32); // expected-error {{argument value 20 is outside the valid range [0, 7]}}
+ _tile_stream_loaddrs(-1, buf, 20); // expected-error {{argument value 255 is outside the valid range [0, 7]}}
+}
diff --git a/clang/test/CodeGen/X86/amx_movrs_tranpose.c b/clang/test/CodeGen/X86/amx_movrs_tranpose.c
new file mode 100755
index 00000000000000..192c153835e1e6
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_movrs_tranpose.c
@@ -0,0 +1,53 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-movrs -emit-llvm -o - -Wall -Werror -pedantic \
+// RUN: -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s
+
+#include <immintrin.h>
+#include <stddef.h>
+
+char buf[2048];
+#define STRIDE 32
+
+// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz0rs_internal(
+// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+void test_tile_2rpntlvwz0rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
+ _tile_2rpntlvwz0rs_internal(row, col0, col1, D0, D1, B, 1);
+}
+
+// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz0rst1_internal(
+// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+void test_tile_2rpntlvwz0rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
+ _tile_2rpntlvwz0rst1_internal(row, col0, col1, D0, D1, B, 1);
+}
+
+// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz1rs_internal(
+// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+void test_tile_2rpntlvwz1rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
+ _tile_2rpntlvwz1rs_internal(row, col0, col1, D0, D1, B, 1);
+}
+
+// CHECK-LABEL: define dso_local void @test_tile_2rpntlvwz1rst1_internal(
+// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+void test_tile_2rpntlvwz1rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
+ _tile_2rpntlvwz1rst1_internal(row, col0, col1, D0, D1, B, 1);
+}
diff --git a/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c b/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c
new file mode 100755
index 00000000000000..b174cc5067bf30
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c
@@ -0,0 +1,81 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-movrs -emit-llvm -o - -Wall -Werror -pedantic \
+// RUN: -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s
+
+#include <immintrin.h>
+#include <stddef.h>
+
+char buf[2048];
+#define STRIDE 32
+
+void test_tile_2rpntlvwz0rs(const void *A, size_t B) {
+ // CHECK-LABEL: @test_tile_2rpntlvwz0rs
+ // CHECK: call void @llvm.x86.t2rpntlvwz0rs(i8 1, ptr %{{.*}}, i64 %{{.*}})
+ _tile_2rpntlvwz0rs(1, A, B);
+}
+
+void test_tile_2rpntlvwz0rst1(const void *A, size_t B) {
+ // CHECK-LABEL: @test_tile_2rpntlvwz0rst1
+ // CHECK: call void @llvm.x86.t2rpntlvwz0rst1(i8 1, ptr %{{.*}}, i64 %{{.*}})
+ _tile_2rpntlvwz0rst1(1, A, B);
+}
+
+void test_tile_2rpntlvwz1rs(const void *A, size_t B) {
+ // CHECK-LABEL: @test_tile_2rpntlvwz1rs
+ // CHECK: call void @llvm.x86.t2rpntlvwz1rs(i8 1, ptr %{{.*}}, i64 %{{.*}})
+ _tile_2rpntlvwz1rs(1, A, B);
+}
+
+void test_tile_2rpntlvwz1rst1(const void *A, size_t B) {
+ // CHECK-LABEL: @test_tile_2rpntlvwz1rst1
+ // CHECK: call void @llvm.x86.t2rpntlvwz1rst1(i8 1, ptr %{{.*}}, i64 %{{.*}})
+ _tile_2rpntlvwz1rst1(1, A, B);
+}
+
+void test__tile_2rpntlvwz0rs(__tile1024i dst0, __tile1024i dst1) {
+ //CHECK-LABEL: @test__tile_2rpntlvwz0rs
+ //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ __tile_2rpntlvwz0rs(&dst0, &dst1, buf, STRIDE);
+}
+
+void test__tile_2rpntlvwz0rst1(__tile1024i dst0, __tile1024i dst1) {
+ //CHECK-LABEL: @test__tile_2rpntlvwz0rst1
+ //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ __tile_2rpntlvwz0rst1(&dst0, &dst1, buf, STRIDE);
+}
+
+void test__tile_2rpntlvwz1rs(__tile1024i dst0, __tile1024i dst1) {
+ //CHECK-LABEL: @test__tile_2rpntlvwz1rs
+ //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ __tile_2rpntlvwz1rs(&dst0, &dst1, buf, STRIDE);
+}
+
+void test__tile_2rpntlvwz1rst1(__tile1024i dst0, __tile1024i dst1) {
+ //CHECK-LABEL: @test__tile_2rpntlvwz1rst1
+ //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ __tile_2rpntlvwz1rst1(&dst0, &dst1, buf, STRIDE);
+}
diff --git a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c
new file mode 100755
index 00000000000000..c8846b36ffa874
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-int8 -target-feature +amx-transpose -target-feature +amx-movrs \
+// RUN: -emit-llvm -verify
+
+#include <immintrin.h>
+#include <stddef.h>
+
+void test_tile_2rpntlvwz0rs(const void *A, size_t B) {
+ _tile_2rpntlvwz0rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
+
+void test_tile_2rpntlvwz0rst1(const void *A, size_t B) {
+ _tile_2rpntlvwz0rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
+
+void test_tile_2rpntlvwz1rs(const void *A, size_t B) {
+ _tile_2rpntlvwz1rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
+
+void test_tile_2rpntlvwz1rst1(const void *A, size_t B) {
+ _tile_2rpntlvwz1rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index c42397024e45a7..d37bda13c3c4ac 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5882,6 +5882,12 @@ let TargetPrefix = "x86" in {
def int_x86_tilestored64 : ClangBuiltin<"__builtin_ia32_tilestored64">,
Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
[ImmArg<ArgIndex<0>>]>;
+ def int_x86_tileloaddrs64 : ClangBuiltin<"__builtin_ia32_tileloaddrs64">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+ [ImmArg<ArgIndex<0>>]>;
+ def int_x86_tileloaddrst164 : ClangBuiltin<"__builtin_ia32_tileloaddrst164">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+ [ImmArg<ArgIndex<0>>]>;
def int_x86_tdpbssd : ClangBuiltin<"__builtin_ia32_tdpbssd">,
Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
@@ -5952,6 +5958,20 @@ let TargetPrefix = "x86" in {
Intrinsic<[], [llvm_i8_ty, llvm_i8_ty],
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
+ // AMX-MORVS, AMX-TRANSPOSE
+ def int_x86_t2rpntlvwz0rs : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0rs">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+ [ImmArg<ArgIndex<0>>]>;
+ def int_x86_t2rpntlvwz0rst1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0rst1">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+ [ImmArg<ArgIndex<0>>]>;
+ def int_x86_t2rpntlvwz1rs : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1rs">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+ [ImmArg<ArgIndex<0>>]>;
+ def int_x86_t2rpntlvwz1rst1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1rst1">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+ [ImmArg<ArgIndex<0>>]>;
+
// AMX - internal intrinsics
def int_x86_ldtilecfg_internal :
ClangBuiltin<"__builtin_ia32_tile_loadconfig_internal">,
@@ -5966,6 +5986,16 @@ let TargetPrefix = "x86" in {
Intrinsic<[llvm_x86amx_ty],
[llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
[]>;
+ def int_x86_tileloaddrs64_internal :
+ ClangBuiltin<"__builtin_ia32_tileloaddrs64_internal">,
+ Intrinsic<[llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+ []>;
+ def int_x86_tileloaddrst164_internal :
+ ClangBuiltin<"__builtin_ia32_tileloaddrst164_internal">,
+ Intrinsic<[llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+ []>;
def int_x86_tdpbssd_internal :
ClangBuiltin<"__builtin_ia32_tdpbssd_internal">,
Intrinsic<[llvm_x86amx_ty],
@@ -6030,6 +6060,24 @@ let TargetPrefix = "x86" in {
llvm_x86amx_ty, llvm_x86amx_ty,
llvm_x86amx_ty], []>;
+ // AMX-MORVS, AMX-TRANSPOSE - internal intrinsics
+ def int_x86_t2rpntlvwz0rs_internal :
+ Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+ [IntrArgMemOnly, IntrReadMem]>;
+ def int_x86_t2rpntlvwz0rst1_internal :
+ Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+ [IntrArgMemOnly]>;
+ def int_x86_t2rpntlvwz1rs_internal :
+ Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+ [IntrArgMemOnly]>;
+ def int_x86_t2rpntlvwz1rst1_internal :
+ Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+ [IntrArgMemOnly]>;
+
def int_x86_t2rpntlvwz0_internal :
Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 160e7c0fc0310a..dfeffae6dec4f5 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -273,6 +273,9 @@ def FeatureAMXCOMPLEX : SubtargetFeature<"amx-complex", "HasAMXCOMPLEX", "true",
def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true",
"Support AMX-FP8 instructions",
[FeatureAMXTILE]>;
+def FeatureAMXMOVRS : SubtargetFeature<"amx-movrs", "HasAMXMOVRS", "true",
+ "Support AMX-MOVRS instructions",
+ [FeatureAMXTILE]>;
def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true",
"Support AMX amx-transpose instructions",
[FeatureAMXTILE]>;
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index f832955d1202fa..94072502c829a4 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -558,6 +558,15 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MI.setDesc(TII->get(GET_EGPR_IF_ENABLED(X86::LDTILECFG)));
return true;
}
+ case X86::PTILELOADDRSV:
+ case X86::PTILELOADDRST1V: {
+ for (unsigned i = 2; i > 0; --i)
+ MI.removeOperand(i);
+ unsigned Opc =
+ Opcode == X86::PTILELOADDRSV ? X86::TILELOADDRS : X86::TILELOADDRST1;
+ MI.setDesc(TII->get(Opc));
+ return true;
+ }
case X86::PTILELOADDV:
case X86::PTILELOADDT1V: {
for (unsigned i = 2; i > 0; --i)
@@ -687,6 +696,32 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MI.setDesc(TII->get(Opc));
return true;
}
+ case X86::PT2RPNTLVWZ0RSV:
+ case X86::PT2RPNTLVWZ0RST1V:
+ case X86::PT2RPNTLVWZ1RSV:
+ case X86::PT2RPNTLVWZ1RST1V: {
+ for (unsigned i = 3; i > 0; --i)
+ MI.removeOperand(i);
+ unsigned Opc;
+ switch (Opcode) {
+ case X86::PT2RPNTLVWZ0RSV:
+ Opc = X86::T2RPNTLVWZ0RS;
+ break;
+ case X86::PT2RPNTLVWZ0RST1V:
+ Opc = X86::T2RPNTLVWZ0RST1;
+ break;
+ case X86::PT2RPNTLVWZ1RSV:
+ Opc = X86::T2RPNTLVWZ1RS;
+ break;
+ case X86::PT2RPNTLVWZ1RST1V:
+ Opc = X86::T2RPNTLVWZ1RST1;
+ break;
+ default:
+ llvm_unreachable("Impossible Opcode!");
+ }
+ MI.setDesc(TII->get(Opc));
+ return true;
+ }
case X86::PTTRANSPOSEDV: {
for (int i = 2; i > 0; --i)
MI.removeOperand(i);
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index aea86c280e2f99..b2b18c1cf45576 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -338,6 +338,10 @@ namespace {
case X86::PT2RPNTLVWZ0T1V:
case X86::PT2RPNTLVWZ1V:
case X86::PT2RPNTLVWZ1T1V:
+ case X86::PT2RPNTLVWZ0RSV:
+ case X86::PT2RPNTLVWZ0RST1V:
+ case X86::PT2RPNTLVWZ1RSV:
+ case X86::PT2RPNTLVWZ1RST1V:
return true;
}
for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
@@ -5189,6 +5193,33 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, CNode);
return;
}
+ case Intrinsic::x86_tileloaddrs64_internal:
+ case Intrinsic::x86_tileloaddrst164_internal: {
+ if (!Subtarget->hasAMXMOVRS())
+ break;
+ unsigned Opc = IntNo == Intrinsic::x86_tileloaddrs64_internal
+ ? X86::PTILELOADDRSV
+ : X86::PTILELOADDRST1V;
+ // _tile_loadd_internal(row, col, buf, STRIDE)
+ SDValue Base = Node->getOperand(4);
+ SDValue Scale = getI8Imm(1, dl);
+ SDValue Index = Node->getOperand(5);
+ SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+ SDValue Chain = Node->getOperand(0);
+ MachineSDNode *CNode;
+ SDValue Ops[] = {Node->getOperand(2),
+ Node->getOperand(3),
+ Base,
+ Scale,
+ Index,
+ Disp,
+ Segment,
+ Chain};
+ CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
+ ReplaceNode(Node, CNode);
+ return;
+ }
}
break;
}
@@ -5307,6 +5338,44 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, CNode);
return;
}
+ case Intrinsic::x86_tileloaddrs64:
+ case Intrinsic::x86_tileloaddrst164: {
+ if (!Subtarget->hasAMXMOVRS())
+ break;
+ auto *MFI =
+ CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
+ unsigned Opc;
+ switch (IntNo) {
+ default:
+ llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_tileloaddrs64:
+ Opc = X86::PTILELOADDRS;
+ break;
+ case Intrinsic::x86_tileloaddrst164:
+ Opc = X86::PTILELOADDRST1;
+ break;
+ }
+ // FIXME: Match displacement and scale.
+ unsigned TIndex = Node->getConstantOperandVal(2);
+ SDValue TReg = getI8Imm(TIndex, dl);
+ SDValue Base = Node->getOperand(3);
+ SDValue Scale = getI8Imm(1, dl);
+ SDValue Index = Node->getOperand(4);
+ SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+ SDValue Chain = Node->getOperand(0);
+ MachineSDNode *CNode;
+ if (Opc == X86::PTILESTORED) {
+ SDValue Ops[] = {Base, Scale, Index, Disp, Segment, TReg, Chain};
+ CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ } else {
+ SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
+ CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ }
+ ReplaceNode(Node, CNode);
+ return;
+ }
case Intrinsic::x86_t2rpntlvwz0:
case Intrinsic::x86_t2rpntlvwz0t1:
case Intrinsic::x86_t2rpntlvwz1:
@@ -5342,9 +5411,45 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
SDValue Segment = CurDAG->getRegister(0, MVT::i16);
SDValue Chain = Node->getOperand(0);
- MachineSDNode *CNode;
SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
- CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ ReplaceNode(Node, CNode);
+ return;
+ }
+ case Intrinsic::x86_t2rpntlvwz0rs:
+ case Intrinsic::x86_t2rpntlvwz0rst1:
+ case Intrinsic::x86_t2rpntlvwz1rs:
+ case Intrinsic::x86_t2rpntlvwz1rst1: {
+ if (!Subtarget->hasAMXTRANSPOSE() || !Subtarget->hasAMXMOVRS())
+ break;
+ unsigned Opc;
+ switch (IntNo) {
+ default:
+ llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_t2rpntlvwz0rs:
+ Opc = X86::PT2RPNTLVWZ0RS;
+ break;
+ case Intrinsic::x86_t2rpntlvwz0rst1:
+ Opc = X86::PT2RPNTLVWZ0RST1;
+ break;
+ case Intrinsic::x86_t2rpntlvwz1rs:
+ Opc = X86::PT2RPNTLVWZ1RS;
+ break;
+ case Intrinsic::x86_t2rpntlvwz1rst1:
+ Opc = X86::PT2RPNTLVWZ1RST1;
+ break;
+ }
+ // FIXME: Match displacement and scale.
+ unsigned TIndex = Node->getConstantOperandVal(2);
+ SDValue TReg = getI8Imm(TIndex, dl);
+ SDValue Base = Node->getOperand(3);
+ SDValue Scale = getI8Imm(1, dl);
+ SDValue Index = Node->getOperand(4);
+ SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+ SDValue Chain = Node->getOperand(0);
+ SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
+ MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
ReplaceNode(Node, CNode);
return;
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0ae814d0ca3bb4..e9a3b0b675564f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -27291,6 +27291,13 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
}
+ case Intrinsic::x86_t2rpntlvwz0rs_internal:
+ case Intrinsic::x86_t2rpntlvwz0rst1_internal:
+ case Intrinsic::x86_t2rpntlvwz1rs_internal:
+ case Intrinsic::x86_t2rpntlvwz1rst1_internal:
+ if (!Subtarget.hasAMXTRANSPOSE() || !Subtarget.hasAMXMOVRS())
+ break;
+ [[fallthrough]];
case Intrinsic::x86_t2rpntlvwz0_internal:
case Intrinsic::x86_t2rpntlvwz0t1_internal:
case Intrinsic::x86_t2rpntlvwz1_internal:
@@ -27316,6 +27323,18 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
case Intrinsic::x86_t2rpntlvwz1t1_internal:
Opc = X86::PT2RPNTLVWZ1T1V;
break;
+ case Intrinsic::x86_t2rpntlvwz0rs_internal:
+ Opc = X86::PT2RPNTLVWZ0RSV;
+ break;
+ case Intrinsic::x86_t2rpntlvwz0rst1_internal:
+ Opc = X86::PT2RPNTLVWZ0RST1V;
+ break;
+ case Intrinsic::x86_t2rpntlvwz1rs_internal:
+ Opc = X86::PT2RPNTLVWZ1RSV;
+ break;
+ case Intrinsic::x86_t2rpntlvwz1rst1_internal:
+ Opc = X86::PT2RPNTLVWZ1RST1V;
+ break;
}
SDLoc DL(Op);
@@ -37553,6 +37572,35 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
+ case X86::PTILELOADDRS:
+ case X86::PTILELOADDRST1: {
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("illegal opcode!");
+ case X86::PTILELOADDRS:
+ Opc = X86::TILELOADDRS;
+ break;
+ case X86::PTILELOADDRST1:
+ Opc = X86::TILELOADDRST1;
+ break;
+ }
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
+ unsigned CurOp = 0;
+ if (Opc != X86::TILESTORED)
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
+ RegState::Define);
+ MIB.add(MI.getOperand(CurOp++)); // base
+ MIB.add(MI.getOperand(CurOp++)); // scale
+ MIB.add(MI.getOperand(CurOp++)); // index -- stride
+ MIB.add(MI.getOperand(CurOp++)); // displacement
+ MIB.add(MI.getOperand(CurOp++)); // segment
+ if (Opc == X86::TILESTORED)
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
+ RegState::Undef);
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
case X86::PTCMMIMFP16PS:
case X86::PTCMMRLFP16PS: {
const MIMetadata MIMD(MI);
@@ -37605,6 +37653,39 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
+ case X86::PT2RPNTLVWZ0RS:
+ case X86::PT2RPNTLVWZ0RST1:
+ case X86::PT2RPNTLVWZ1RS:
+ case X86::PT2RPNTLVWZ1RST1: {
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected instruction!");
+ case X86::PT2RPNTLVWZ0RS:
+ Opc = X86::T2RPNTLVWZ0RS;
+ break;
+ case X86::PT2RPNTLVWZ0RST1:
+ Opc = X86::T2RPNTLVWZ0RST1;
+ break;
+ case X86::PT2RPNTLVWZ1RS:
+ Opc = X86::T2RPNTLVWZ1RS;
+ break;
+ case X86::PT2RPNTLVWZ1RST1:
+ Opc = X86::T2RPNTLVWZ1RST1;
+ break;
+ }
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
+ MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
+
+ MIB.add(MI.getOperand(1)); // base
+ MIB.add(MI.getOperand(2)); // scale
+ MIB.add(MI.getOperand(3)); // index
+ MIB.add(MI.getOperand(4)); // displacement
+ MIB.add(MI.getOperand(5)); // segment
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
case X86::PTTRANSPOSED: {
const DebugLoc &DL = MI.getDebugLoc();
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index 947a8bec2890ef..efd396cd2bc439 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -369,3 +369,94 @@ let Predicates = [HasAMXTRANSPOSE, In64BitMode] in {
}
}
} // HasAMXTILE, HasAMXTRANSPOSE
+
+let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in {
+ def T2RPNTLVWZ0RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst),
+ (ins sibmem:$src1),
+ "t2rpntlvwz0rs\t{$src1, $dst|$dst, $src1}",
+ []>, VEX, T_MAP5;
+ def T2RPNTLVWZ0RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst),
+ (ins sibmem:$src1),
+ "t2rpntlvwz0rst1\t{$src1, $dst|$dst, $src1}",
+ []>, VEX, T_MAP5;
+ def T2RPNTLVWZ1RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst),
+ (ins sibmem:$src1),
+ "t2rpntlvwz1rs\t{$src1, $dst|$dst, $src1}",
+ []>, VEX, T_MAP5, PD;
+ def T2RPNTLVWZ1RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst),
+ (ins sibmem:$src1),
+ "t2rpntlvwz1rst1\t{$src1, $dst|$dst, $src1}",
+ []>, VEX, T_MAP5, PD;
+ let isPseudo = true in {
+ def PT2RPNTLVWZ0RSV : PseudoI<(outs TILEPair:$dst),
+ (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
+ []>;
+ def PT2RPNTLVWZ0RST1V : PseudoI<(outs TILEPair:$dst),
+ (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
+ []>;
+ def PT2RPNTLVWZ1RSV : PseudoI<(outs TILEPair:$dst),
+ (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
+ []>;
+ def PT2RPNTLVWZ1RST1V : PseudoI<(outs TILEPair:$dst),
+ (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
+ []>;
+ }
+ let usesCustomInserter = 1 in {
+ def PT2RPNTLVWZ0RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
+ def PT2RPNTLVWZ0RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
+ def PT2RPNTLVWZ1RS : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
+ def PT2RPNTLVWZ1RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
+ }
+} // HasAMXMOVRS, HasAMXTRANSPOSE
+
+let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in {
+ def TILELOADDRS : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst),
+ (ins sibmem:$src1),
+ "tileloaddrs\t{$src1, $dst|$dst, $src1}",
+ []>, VEX, T8, XD;
+ def TILELOADDRST1 : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst),
+ (ins sibmem:$src1),
+ "tileloaddrst1\t{$src1, $dst|$dst, $src1}",
+ []>, VEX, T8, PD;
+
+ let isPseudo = true, mayLoad = 1 in
+ def PTILELOADDRSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+ GR16:$src2,
+ opaquemem:$src3), []>;
+ let isPseudo = true, mayLoad = 1 in
+ def PTILELOADDRST1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+ GR16:$src2,
+ opaquemem:$src3), []>;
+ let usesCustomInserter = 1 in {
+ let mayLoad = 1 in
+ def PTILELOADDRS : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>;
+ let mayLoad = 1 in
+ def PTILELOADDRST1 : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>;
+ }
+
+ def TILELOADDRSrm_EVEX : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst),
+ (ins sibmem:$src1),
+ "tileloaddrs\t{$src1, $dst|$dst, $src1}",
+ []>, EVEX, NoCD8, T8, XD;
+ def TILELOADDRST1rm_EVEX : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst),
+ (ins sibmem:$src1),
+ "tileloaddrst1\t{$src1, $dst|$dst, $src1}",
+ []>, EVEX, NoCD8, T8, PD;
+
+ def T2RPNTLVWZ0RS_EVEX : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst),
+ (ins sibmem:$src1),
+ "t2rpntlvwz0rs\t{$src1, $dst|$dst, $src1}",
+ []>, EVEX, NoCD8, T_MAP5;
+ def T2RPNTLVWZ0RST1_EVEX : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst),
+ (ins sibmem:$src1),
+ "t2rpntlvwz0rst1\t{$src1, $dst|$dst, $src1}",
+ []>, EVEX, NoCD8, T_MAP5;
+ def T2RPNTLVWZ1RS_EVEX : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst),
+ (ins sibmem:$src1),
+ "t2rpntlvwz1rs\t{$src1, $dst|$dst, $src1}",
+ []>, EVEX, NoCD8, T_MAP5, PD;
+ def T2RPNTLVWZ1RST1_EVEX : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst),
+ (ins sibmem:$src1),
+ "t2rpntlvwz1rst1\t{$src1, $dst|$dst, $src1}",
+ []>, EVEX, NoCD8, T_MAP5, PD;
+} // HasAMXMOVRS, In64BitMode
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 9b002ebd3a93bc..41bece5e2cec4e 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4738,6 +4738,7 @@ static bool isAMXOpcode(unsigned Opc) {
case X86::TILELOADD_EVEX:
case X86::TILESTORED_EVEX:
case X86::PTILEPAIRLOAD:
+ case X86::TILELOADDRS:
return true;
}
}
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index d22e7dadaaa262..7a31e4212670b9 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -184,6 +184,7 @@ def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">;
def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">;
def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">;
def HasAMXFP8 : Predicate<"Subtarget->hasAMXFP8()">;
+def HasAMXMOVRS : Predicate<"Subtarget->hasAMXMOVRS()">;
def HasAMXTRANSPOSE : Predicate<"Subtarget->hasAMXTRANSPOSE()">;
def HasUINTR : Predicate<"Subtarget->hasUINTR()">;
def HasUSERMSR : Predicate<"Subtarget->hasUSERMSR()">;
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 688e886cf3b13a..c5c0f7a03b33a8 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -229,7 +229,13 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II,
case Intrinsic::x86_t2rpntlvwz1t1_internal:
case Intrinsic::x86_tileloadd64_internal:
case Intrinsic::x86_tileloaddt164_internal:
- case Intrinsic::x86_tilestored64_internal: {
+ case Intrinsic::x86_tilestored64_internal:
+ case Intrinsic::x86_t2rpntlvwz0rs_internal:
+ case Intrinsic::x86_t2rpntlvwz0rst1_internal:
+ case Intrinsic::x86_t2rpntlvwz1rs_internal:
+ case Intrinsic::x86_t2rpntlvwz1rst1_internal:
+ case Intrinsic::x86_tileloaddrs64_internal:
+ case Intrinsic::x86_tileloaddrst164_internal: {
Row = II->getArgOperand(0);
Col = II->getArgOperand(1);
break;
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 2daaa95b06be0d..6c7fc4bd49e80d 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -1078,7 +1078,9 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
case X86::PTDPFP16PSV:
case X86::PTCMMIMFP16PSV:
case X86::PTCMMRLFP16PSV:
- case X86::PTTRANSPOSEDV: {
+ case X86::PTTRANSPOSEDV:
+ case X86::PTILELOADDRSV:
+ case X86::PTILELOADDRST1V: {
MachineOperand &MO1 = MI->getOperand(1);
MachineOperand &MO2 = MI->getOperand(2);
ShapeT Shape(&MO1, &MO2, MRI);
@@ -1088,7 +1090,11 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
case X86::PT2RPNTLVWZ0V:
case X86::PT2RPNTLVWZ0T1V:
case X86::PT2RPNTLVWZ1V:
- case X86::PT2RPNTLVWZ1T1V: {
+ case X86::PT2RPNTLVWZ1T1V:
+ case X86::PT2RPNTLVWZ0RSV:
+ case X86::PT2RPNTLVWZ0RST1V:
+ case X86::PT2RPNTLVWZ1RSV:
+ case X86::PT2RPNTLVWZ1RST1V: {
MachineOperand &MO1 = MI->getOperand(1);
MachineOperand &MO2 = MI->getOperand(2);
MachineOperand &MO3 = MI->getOperand(3);
diff --git a/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll
new file mode 100755
index 00000000000000..da212a1850964e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-movrs | FileCheck %s
+
+define void @test_amx_internal(i16 %m, i16 %n, ptr %buf, i64 %s) {
+; CHECK-LABEL: test_amx_internal:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
+; CHECK-NEXT: subq $3072, %rsp # imm = 0xC00
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; CHECK-NEXT: movw %ax, %cx
+; CHECK-NEXT: movw %di, %ax
+; CHECK-NEXT: # implicit-def: $al
+; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT: tileloaddrs (%rdx,%rsi), %tmm0
+; CHECK-NEXT: movl $64, %esi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-NEXT: tilerelease
+; CHECK-NEXT: retq
+entry:
+ %t1 = call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %m, i16 %n, ptr %buf, i64 %s)
+ %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
+ ret void
+}
+declare x86_amx @llvm.x86.tileloaddrs64.internal(i16, i16, ptr, i64)
+
+define void @test_amx_old(i16 %m, i16 %n, ptr %buf) {
+; CHECK-LABEL: test_amx_old:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl $32, %eax
+; CHECK-NEXT: tileloaddrs (%rdx,%rax), %tmm2
+; CHECK-NEXT: retq
+entry:
+ call void @llvm.x86.tileloaddrs64(i8 2, ptr %buf, i64 32)
+ ret void
+}
+declare void @llvm.x86.tileloaddrs64(i8 immarg, ptr, i64)
+
+define void @test_amx_t1_internal(i16 %m, i16 %n, ptr %buf, i64 %s) {
+; CHECK-LABEL: test_amx_t1_internal:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
+; CHECK-NEXT: subq $3072, %rsp # imm = 0xC00
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; CHECK-NEXT: movw %ax, %cx
+; CHECK-NEXT: movw %di, %ax
+; CHECK-NEXT: # implicit-def: $al
+; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT: tileloaddrst1 (%rdx,%rsi), %tmm0
+; CHECK-NEXT: movl $64, %esi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-NEXT: tilerelease
+; CHECK-NEXT: retq
+entry:
+ %t1 = call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %m, i16 %n, ptr %buf, i64 %s)
+ %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
+ ret void
+}
+declare x86_amx @llvm.x86.tileloaddrst164.internal(i16, i16, ptr, i64)
+
+define void @test_amx_t1_old(i16 %m, i16 %n, ptr %buf) {
+; CHECK-LABEL: test_amx_t1_old:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl $32, %eax
+; CHECK-NEXT: tileloaddrst1 (%rdx,%rax), %tmm2
+; CHECK-NEXT: retq
+entry:
+ call void @llvm.x86.tileloaddrst164(i8 2, ptr %buf, i64 32)
+ ret void
+}
+declare void @llvm.x86.tileloaddrst164(i8 immarg, ptr, i64)
diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll
new file mode 100755
index 00000000000000..146b69773eb186
--- /dev/null
+++ b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0
+; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2
+
+define void @test_amx(i64 %stride, i8* %addr1) #0 {
+; CHECK-LABEL: test_amx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0
+; CHECK-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2
+; CHECK-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0
+; CHECK-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2
+; CHECK-NEXT: retq
+ call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride)
+ call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride)
+ call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride)
+ call void @llvm.x86.t2rpntlvwz1rst1(i8 2, i8* %addr1, i64 %stride)
+ ret void
+}
+declare void @llvm.x86.t2rpntlvwz0rs(i8 , i8* , i64 )
+declare void @llvm.x86.t2rpntlvwz0rst1(i8 , i8* , i64 )
+declare void @llvm.x86.t2rpntlvwz1rs(i8 , i8* , i64 )
+declare void @llvm.x86.t2rpntlvwz1rst1(i8 , i8* , i64 )
+
+define void @test_amx2(i8* %base, i64 %stride) #0 {
+; O0-LABEL: test_amx2:
+; O0: # %bb.0:
+; O0-NEXT: xorps %xmm0, %xmm0
+; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movb $1, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movw $8, %ax
+; O0-NEXT: # implicit-def: $al
+; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; O0-NEXT: # implicit-def: $al
+; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
+; O0-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4
+; O0-NEXT: movw $8, %ax
+; O0-NEXT: # implicit-def: $al
+; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; O0-NEXT: # implicit-def: $al
+; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
+; O0-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4
+; O0-NEXT: movw $8, %ax
+; O0-NEXT: # implicit-def: $al
+; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; O0-NEXT: # implicit-def: $al
+; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
+; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
+; O0-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4
+; O0-NEXT: tilerelease
+; O0-NEXT: retq
+;
+; O2-LABEL: test_amx2:
+; O2: # %bb.0:
+; O2-NEXT: xorps %xmm0, %xmm0
+; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; O2-NEXT: movb $1, -{{[0-9]+}}(%rsp)
+; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp)
+; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp)
+; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp)
+; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp)
+; O2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
+; O2-NEXT: movw $8, %ax
+; O2-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4
+; O2-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4
+; O2-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4
+; O2-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4
+; O2-NEXT: tilerelease
+; O2-NEXT: retq
+ call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
+ call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
+ call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
+ call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
+ ret void
+}
+declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16, i16, i16, i8*, i64)
+declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16, i16, i16, i8*, i64)
+declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16, i16, i16, i8*, i64)
+declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16, i16, i16, i8*, i64)
diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt
new file mode 100755
index 00000000000000..6df44c87d2332f
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt
@@ -0,0 +1,98 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s -check-prefix=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s -check-prefix=INTEL
+
+# ATT: t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6
+# INTEL: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456]
+0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2
+# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291]
+0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: t2rpntlvwz0rs 64(%rbx), %tmm6
+# INTEL: t2rpntlvwz0rs tmm6, [rbx + 64]
+0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40
+
+# ATT: t2rpntlvwz0rs -32(,%rbp,2), %tmm2
+# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32]
+0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT: t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6
+# INTEL: t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456]
+0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2
+# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291]
+0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: t2rpntlvwz0rst1 64(%rbx), %tmm6
+# INTEL: t2rpntlvwz0rst1 tmm6, [rbx + 64]
+0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40
+
+# ATT: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2
+# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
+0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT: t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6
+# INTEL: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456]
+0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2
+# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291]
+0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: t2rpntlvwz1rs 64(%rbx), %tmm6
+# INTEL: t2rpntlvwz1rs tmm6, [rbx + 64]
+0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40
+
+# ATT: t2rpntlvwz1rs -32(,%rbp,2), %tmm2
+# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32]
+0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT: t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6
+# INTEL: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456]
+0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2
+# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291]
+0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: t2rpntlvwz1rst1 64(%rbx), %tmm6
+# INTEL: t2rpntlvwz1rst1 tmm6, [rbx + 64]
+0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40
+
+# ATT: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2
+# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
+0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT: tileloaddrs 268435456(%rbp,%r14,8), %tmm6
+# INTEL: tileloaddrs tmm6, [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: tileloaddrs 291(%r8,%rax,4), %tmm3
+# INTEL: tileloaddrs tmm3, [r8 + 4*rax + 291]
+0xc4,0xc2,0x7b,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: tileloaddrs 64(%rbx), %tmm6
+# INTEL: tileloaddrs tmm6, [rbx + 64]
+0xc4,0xe2,0x7b,0x4a,0x74,0x23,0x40
+
+# ATT: tileloaddrs -32(,%rbp,2), %tmm3
+# INTEL: tileloaddrs tmm3, [2*rbp - 32]
+0xc4,0xe2,0x7b,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT: tileloaddrst1 268435456(%rbp,%r14,8), %tmm6
+# INTEL: tileloaddrst1 tmm6, [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x79,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: tileloaddrst1 291(%r8,%rax,4), %tmm3
+# INTEL: tileloaddrst1 tmm3, [r8 + 4*rax + 291]
+0xc4,0xc2,0x79,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT: tileloaddrst1 64(%rbx), %tmm6
+# INTEL: tileloaddrst1 tmm6, [rbx + 64]
+0xc4,0xe2,0x79,0x4a,0x74,0x23,0x40
+
+# ATT: tileloaddrst1 -32(,%rbp,2), %tmm3
+# INTEL: tileloaddrst1 tmm3, [2*rbp - 32]
+0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff
diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s
new file mode 100755
index 00000000000000..d780ad4f0e3691
--- /dev/null
+++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s
@@ -0,0 +1,89 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK: t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6
+// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6
+
+// CHECK: t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2
+// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2
+
+// CHECK: t2rpntlvwz0rs 64(%rbx), %tmm6
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40]
+ t2rpntlvwz0rs 64(%rbx), %tmm6
+
+// CHECK: t2rpntlvwz0rs -32(,%rbp,2), %tmm2
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz0rs -32(,%rbp,2), %tmm2
+
+// CHECK: t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6
+// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6
+
+// CHECK: t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2
+// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2
+
+// CHECK: t2rpntlvwz0rst1 64(%rbx), %tmm6
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40]
+ t2rpntlvwz0rst1 64(%rbx), %tmm6
+
+// CHECK: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2
+
+// CHECK: t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6
+// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6
+
+// CHECK: t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2
+// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2
+
+// CHECK: t2rpntlvwz1rs 64(%rbx), %tmm6
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40]
+ t2rpntlvwz1rs 64(%rbx), %tmm6
+
+// CHECK: t2rpntlvwz1rs -32(,%rbp,2), %tmm2
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz1rs -32(,%rbp,2), %tmm2
+
+// CHECK: t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6
+// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6
+
+// CHECK: t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2
+// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2
+
+// CHECK: t2rpntlvwz1rst1 64(%rbx), %tmm6
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40]
+ t2rpntlvwz1rst1 64(%rbx), %tmm6
+
+// CHECK: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2
+
+// CHECK: tileloaddrs 268435456(%rbp,%r14,8), %tmm6
+// CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ tileloaddrs 268435456(%rbp,%r14,8), %tmm6
+
+// CHECK: tileloaddrs 291(%r8,%rax,4), %tmm3
+// CHECK: encoding: [0xc4,0xc2,0x7b,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ tileloaddrs 291(%r8,%rax,4), %tmm3
+
+// CHECK: tileloaddrs -32(,%rbp,2), %tmm3
+// CHECK: encoding: [0xc4,0xe2,0x7b,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff]
+ tileloaddrs -32(,%rbp,2), %tmm3
+
+// CHECK: tileloaddrst1 268435456(%rbp,%r14,8), %tmm6
+// CHECK: encoding: [0xc4,0xa2,0x79,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ tileloaddrst1 268435456(%rbp,%r14,8), %tmm6
+
+// CHECK: tileloaddrst1 291(%r8,%rax,4), %tmm3
+// CHECK: encoding: [0xc4,0xc2,0x79,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ tileloaddrst1 291(%r8,%rax,4), %tmm3
+
+// CHECK: tileloaddrst1 -32(,%rbp,2), %tmm3
+// CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff]
+ tileloaddrst1 -32(,%rbp,2), %tmm3
\ No newline at end of file
diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s
new file mode 100755
index 00000000000000..ccc7ac51a98a44
--- /dev/null
+++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s
@@ -0,0 +1,97 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291]
+
+// CHECK: t2rpntlvwz0rs tmm6, [rbx + 64]
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40]
+ t2rpntlvwz0rs tmm6, [rbx + 64]
+
+// CHECK: t2rpntlvwz0rs tmm2, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz0rs tmm2, [2*rbp - 32]
+
+// CHECK: t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291]
+
+// CHECK: t2rpntlvwz0rst1 tmm6, [rbx + 64]
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40]
+ t2rpntlvwz0rst1 tmm6, [rbx + 64]
+
+// CHECK: t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
+
+// CHECK: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291]
+
+// CHECK: t2rpntlvwz1rs tmm6, [rbx + 64]
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40]
+ t2rpntlvwz1rs tmm6, [rbx + 64]
+
+// CHECK: t2rpntlvwz1rs tmm2, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz1rs tmm2, [2*rbp - 32]
+
+// CHECK: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291]
+
+// CHECK: t2rpntlvwz1rst1 tmm6, [rbx + 64]
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40]
+ t2rpntlvwz1rst1 tmm6, [rbx + 64]
+
+// CHECK: t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
+
+// CHECK: tileloaddrs tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ tileloaddrs tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: tileloaddrs tmm3, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x7b,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ tileloaddrs tmm3, [r8 + 4*rax + 291]
+
+// CHECK: tileloaddrs tmm6, [rbx + 64]
+// CHECK: encoding: [0xc4,0xe2,0x7b,0x4a,0x74,0x23,0x40]
+ tileloaddrs tmm6, [rbx + 64]
+
+// CHECK: tileloaddrs tmm3, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe2,0x7b,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff]
+ tileloaddrs tmm3, [2*rbp - 32]
+
+// CHECK: tileloaddrst1 tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x79,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ tileloaddrst1 tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: tileloaddrst1 tmm3, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x79,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00]
+ tileloaddrst1 tmm3, [r8 + 4*rax + 291]
+
+// CHECK: tileloaddrst1 tmm6, [rbx + 64]
+// CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x74,0x23,0x40]
+ tileloaddrst1 tmm6, [rbx + 64]
+
+// CHECK: tileloaddrst1 tmm3, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff]
+ tileloaddrst1 tmm3, [2*rbp - 32]
>From e7a09d7ccbbcd0ed222cdbc57236d2158306457e Mon Sep 17 00:00:00 2001
From: Malay Sanghi <malay.sanghi at intel.com>
Date: Wed, 6 Nov 2024 18:16:51 +0800
Subject: [PATCH 2/4] update test
---
clang/test/CodeGen/X86/amx_movrs_errors.c | 2 +-
clang/test/CodeGen/X86/amx_movrs_transpose_errors.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/clang/test/CodeGen/X86/amx_movrs_errors.c b/clang/test/CodeGen/X86/amx_movrs_errors.c
index bac7d962f5cb5c..2790126eb8672b 100755
--- a/clang/test/CodeGen/X86/amx_movrs_errors.c
+++ b/clang/test/CodeGen/X86/amx_movrs_errors.c
@@ -1,7 +1,7 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-reduce -target-feature +amx-memory \
-// RUN: -target-feature +amx-format -target-feature +amx-element -emit-llvm -verify
+// RUN: -target-feature +amx-format -target-feature +amx-element -verify
#include <immintrin.h>
#include <stddef.h>
diff --git a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c
index c8846b36ffa874..840b52bbb29bbf 100755
--- a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c
+++ b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c
@@ -1,6 +1,6 @@
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
// RUN: -target-feature +amx-int8 -target-feature +amx-transpose -target-feature +amx-movrs \
-// RUN: -emit-llvm -verify
+// RUN: -verify
#include <immintrin.h>
#include <stddef.h>
>From 193420c854b52023e1a2874f05e1e019436111ab Mon Sep 17 00:00:00 2001
From: Malay Sanghi <malay.sanghi at intel.com>
Date: Mon, 11 Nov 2024 07:14:22 -0800
Subject: [PATCH 3/4] review
---
clang/docs/ReleaseNotes.rst | 1 +
clang/lib/Headers/CMakeLists.txt | 3 +-
clang/lib/Headers/amxmovrstransposeintrin.h | 201 ++++++++++++++++++
clang/lib/Headers/amxtransposeintrin.h | 177 ---------------
clang/lib/Headers/immintrin.h | 1 +
clang/test/CodeGen/X86/amx_movrs_errors.c | 4 +-
.../llvm/TargetParser/X86TargetParser.def | 1 +
llvm/lib/Target/X86/X86ExpandPseudo.cpp | 37 ++--
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 127 ++++-------
llvm/lib/Target/X86/X86ISelLowering.cpp | 65 ++----
llvm/lib/Target/X86/X86InstrAMX.td | 26 ---
llvm/lib/Target/X86/X86InstrInfo.cpp | 1 -
llvm/lib/TargetParser/Host.cpp | 1 +
llvm/lib/TargetParser/X86TargetParser.cpp | 1 +
14 files changed, 272 insertions(+), 374 deletions(-)
create mode 100644 clang/lib/Headers/amxmovrstransposeintrin.h
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c3424e0e6f34c9..302eb8bf3fd0b4 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -739,6 +739,7 @@ X86 Support
* Supported intrinsics of ``_mm(256|512)_(mask(z))_loadrs_epi(8|16|32|64)``.
- Support ISA of ``AMX-FP8``.
- Support ISA of ``AMX-TRANSPOSE``.
+- Support ISA of ``AMX-MOVRS``.
- Support ISA of ``AMX-AVX512``.
Arm and AArch64 Support
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index d7119be4ef8a85..e52a6f9dd40d9a 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -151,8 +151,9 @@ set(x86_files
amxfp16intrin.h
amxfp8intrin.h
amxintrin.h
- amxtransposeintrin.h
amxmovrsintrin.h
+ amxmovrstransposeintrin.h
+ amxtransposeintrin.h
avx10_2_512bf16intrin.h
avx10_2_512convertintrin.h
avx10_2_512minmaxintrin.h
diff --git a/clang/lib/Headers/amxmovrstransposeintrin.h b/clang/lib/Headers/amxmovrstransposeintrin.h
new file mode 100644
index 00000000000000..84360e6bef8388
--- /dev/null
+++ b/clang/lib/Headers/amxmovrstransposeintrin.h
@@ -0,0 +1,201 @@
+/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics -*- C++
+ * -*---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * ===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <amxmovrstransposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_MOVRS_TRANSPOSEINTRIN_H
+#define __AMX_MOVRS_TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("amx-transpose,amx-movrs")))
+
+#define _tile_2rpntlvwz0rs(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride)
+#define _tile_2rpntlvwz0rst1(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride)
+#define _tile_2rpntlvwz1rs(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride)
+#define _tile_2rpntlvwz1rst1(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride)
+
+static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rs_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ // Use __tile1024i_1024a* to escape the alignment check in
+ // clang/test/Headers/x86-intrinsics-headers-clean.cpp
+ __builtin_ia32_t2rpntlvwz0rs_internal(
+ row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rst1_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ __builtin_ia32_t2rpntlvwz0rst1_internal(
+ row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rs_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ __builtin_ia32_t2rpntlvwz1rs_internal(
+ row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rst1_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ __builtin_ia32_t2rpntlvwz1rst1_internal(
+ row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+/// Provides a hint to the implementation that the data will likely become
+/// read shared in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS
+static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1RS </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS
+static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written. The last row will be not be read from memory but instead
+/// filled with zeros.
+/// Provides a hint to the implementation that the data will likely become
+/// read shared in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS
+static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written. The last row will be not be read from memory but instead
+/// filled with zeros.
+/// Provides a hint to the implementation that the data will likely become
+/// read shared in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1RS </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS
+static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+#undef __DEFAULT_FN_ATTRS
+#endif /* __x86_64__ */
+#endif /* __AMX_MOVRS_TRANSPOSEINTRIN_H */
\ No newline at end of file
diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h
index 086c9a75222ca1..b3fa37d766c45b 100644
--- a/clang/lib/Headers/amxtransposeintrin.h
+++ b/clang/lib/Headers/amxtransposeintrin.h
@@ -17,9 +17,6 @@
#define __DEFAULT_FN_ATTRS_TRANSPOSE \
__attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))
-#define __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS \
- __attribute__((__always_inline__, __nodebug__, \
- __target__("amx-transpose,amx-movrs")))
#define _tile_2rpntlvwz0(tdst, base, stride) \
__builtin_ia32_t2rpntlvwz0(tdst, base, stride)
@@ -29,15 +26,6 @@
__builtin_ia32_t2rpntlvwz1(tdst, base, stride)
#define _tile_2rpntlvwz1t1(tdst, base, stride) \
__builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)
-// MOVRS versions
-#define _tile_2rpntlvwz0rs(tdst, base, stride) \
- __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride)
-#define _tile_2rpntlvwz0rst1(tdst, base, stride) \
- __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride)
-#define _tile_2rpntlvwz1rs(tdst, base, stride) \
- __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride)
-#define _tile_2rpntlvwz1rst1(tdst, base, stride) \
- __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride)
/// Transpose 32-bit elements from \a src and write the result to \a dst.
///
@@ -113,45 +101,6 @@ _tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) {
return __builtin_ia32_ttransposed_internal(m, n, src);
}
-static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
-_tile_2rpntlvwz0rs_internal(unsigned short row, unsigned short col0,
- unsigned short col1, _tile1024i *dst0,
- _tile1024i *dst1, const void *base,
- __SIZE_TYPE__ stride) {
- // Use __tile1024i_1024a* to escape the alignment check in
- // clang/test/Headers/x86-intrinsics-headers-clean.cpp
- __builtin_ia32_t2rpntlvwz0rs_internal(
- row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
- (__SIZE_TYPE__)(stride));
-}
-static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
-_tile_2rpntlvwz0rst1_internal(unsigned short row, unsigned short col0,
- unsigned short col1, _tile1024i *dst0,
- _tile1024i *dst1, const void *base,
- __SIZE_TYPE__ stride) {
- __builtin_ia32_t2rpntlvwz0rst1_internal(
- row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
- (__SIZE_TYPE__)(stride));
-}
-static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
-_tile_2rpntlvwz1rs_internal(unsigned short row, unsigned short col0,
- unsigned short col1, _tile1024i *dst0,
- _tile1024i *dst1, const void *base,
- __SIZE_TYPE__ stride) {
- __builtin_ia32_t2rpntlvwz1rs_internal(
- row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
- (__SIZE_TYPE__)(stride));
-}
-static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
-_tile_2rpntlvwz1rst1_internal(unsigned short row, unsigned short col0,
- unsigned short col1, _tile1024i *dst0,
- _tile1024i *dst1, const void *base,
- __SIZE_TYPE__ stride) {
- __builtin_ia32_t2rpntlvwz1rst1_internal(
- row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
- (__SIZE_TYPE__)(stride));
-}
-
/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
@@ -280,131 +229,6 @@ static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
&dst1->tile, base, stride);
}
-/// Converts a pair of tiles from memory into VNNI format, and places the
-/// results in a pair of destinations specified by dst. The pair of tiles
-/// in memory is specified via a tsib; the second tile is after the first
-/// one, separated by the same stride that separates each row.
-/// The tile configuration for the destination tiles indicates the amount
-/// of data to read from memory. The instruction will load a number of rows
-/// that is equal to twice the number of rows in tmm1. The size of each row
-/// is equal to the average width of the destination tiles. If the second
-/// tile is configured with zero rows and columns, only the first tile will
-/// be written.
-/// Provides a hint to the implementation that the data will likely become
-/// read shared in the near future and the data caching can be optimized.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction.
-///
-/// \param dst0
-/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param dst1
-/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param base
-/// A pointer to base address.
-/// \param stride
-/// The stride between the rows' data to be loaded in memory.
-__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
-static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1,
- const void *base, __SIZE_TYPE__ stride) {
- _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
- &dst1->tile, base, stride);
-}
-/// Converts a pair of tiles from memory into VNNI format, and places the
-/// results in a pair of destinations specified by dst. The pair of tiles
-/// in memory is specified via a tsib; the second tile is after the first
-/// one, separated by the same stride that separates each row.
-/// The tile configuration for the destination tiles indicates the amount
-/// of data to read from memory. The instruction will load a number of rows
-/// that is equal to twice the number of rows in tmm1. The size of each row
-/// is equal to the average width of the destination tiles. If the second
-/// tile is configured with zero rows and columns, only the first tile will
-/// be written.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1RS </c> instruction.
-///
-/// \param dst0
-/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param dst1
-/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param base
-/// A pointer to base address.
-/// \param stride
-/// The stride between the rows' data to be loaded in memory.
-__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
-static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1,
- const void *base, __SIZE_TYPE__ stride) {
- _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
- &dst1->tile, base, stride);
-}
-/// Converts a pair of tiles from memory into VNNI format, and places the
-/// results in a pair of destinations specified by dst. The pair of tiles
-/// in memory is specified via a tsib; the second tile is after the first
-/// one, separated by the same stride that separates each row.
-/// The tile configuration for the destination tiles indicates the amount
-/// of data to read from memory. The instruction will load a number of rows
-/// that is equal to twice the number of rows in tmm1. The size of each row
-/// is equal to the average width of the destination tiles. If the second
-/// tile is configured with zero rows and columns, only the first tile will
-/// be written. The last row will be not be read from memory but instead
-/// filled with zeros.
-/// Provides a hint to the implementation that the data will likely become
-/// read shared in the near future and the data caching can be optimized.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
-///
-/// \param dst0
-/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param dst1
-/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param base
-/// A pointer to base address.
-/// \param stride
-/// The stride between the rows' data to be loaded in memory.
-__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
-static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1,
- const void *base, __SIZE_TYPE__ stride) {
- _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
- &dst1->tile, base, stride);
-}
-/// Converts a pair of tiles from memory into VNNI format, and places the
-/// results in a pair of destinations specified by dst. The pair of tiles
-/// in memory is specified via a tsib; the second tile is after the first
-/// one, separated by the same stride that separates each row.
-/// The tile configuration for the destination tiles indicates the amount
-/// of data to read from memory. The instruction will load a number of rows
-/// that is equal to twice the number of rows in tmm1. The size of each row
-/// is equal to the average width of the destination tiles. If the second
-/// tile is configured with zero rows and columns, only the first tile will
-/// be written. The last row will be not be read from memory but instead
-/// filled with zeros.
-/// Provides a hint to the implementation that the data will likely become
-/// read shared in the near future and the data caching can be optimized.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1RS </c> instruction.
-///
-/// \param dst0
-/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param dst1
-/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
-/// \param base
-/// A pointer to base address.
-/// \param stride
-/// The stride between the rows' data to be loaded in memory.
-__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
-static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1,
- const void *base, __SIZE_TYPE__ stride) {
- _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
- &dst1->tile, base, stride);
-}
-
/// Transpose 32-bit elements from src and write the result to dst.
///
/// \headerfile <immintrin.h>
@@ -420,6 +244,5 @@ static void __tile_transposed(__tile1024i *dst, __tile1024i src) {
dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile);
}
-#undef __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
#endif /* __x86_64__ */
#endif /* __AMX_TRANSPOSEINTRIN_H */
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 37e6ff071e26bd..574dc79fa1b24b 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -658,6 +658,7 @@ _storebe_i64(void * __P, long long __D) {
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_MOVRS__)
#include <amxmovrsintrin.h>
+#include <amxmovrstransposeintrin.h>
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_AVX512__)
diff --git a/clang/test/CodeGen/X86/amx_movrs_errors.c b/clang/test/CodeGen/X86/amx_movrs_errors.c
index 2790126eb8672b..4263e75ce9a28c 100755
--- a/clang/test/CodeGen/X86/amx_movrs_errors.c
+++ b/clang/test/CodeGen/X86/amx_movrs_errors.c
@@ -1,7 +1,7 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
-// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-reduce -target-feature +amx-memory \
-// RUN: -target-feature +amx-format -target-feature +amx-element -verify
+// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 \
+// RUN: -target-feature +amx-element -verify
#include <immintrin.h>
#include <stddef.h>
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index 815556e374bef5..026db53b2d9269 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -266,6 +266,7 @@ X86_FEATURE (MOVRS, "movrs")
X86_FEATURE (ZU, "zu")
X86_FEATURE (AMX_FP8, "amx-fp8")
X86_FEATURE (AMX_TRANSPOSE, "amx-transpose")
+X86_FEATURE (AMX_MOVRS, "amx-movrs")
X86_FEATURE (AMX_AVX512, "amx-avx512")
// These features aren't really CPU features, but the frontend can set them.
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk")
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 3648a828b6d1a4..7c7c5f642a7032 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -557,17 +557,10 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MI.setDesc(TII->get(GET_EGPR_IF_ENABLED(X86::LDTILECFG)));
return true;
}
- case X86::PTILELOADDRSV:
- case X86::PTILELOADDRST1V: {
- for (unsigned i = 2; i > 0; --i)
- MI.removeOperand(i);
- unsigned Opc =
- Opcode == X86::PTILELOADDRSV ? X86::TILELOADDRS : X86::TILELOADDRST1;
- MI.setDesc(TII->get(Opc));
- return true;
- }
case X86::PTILELOADDV:
case X86::PTILELOADDT1V:
+ case X86::PTILELOADDRSV:
+ case X86::PTILELOADDRST1V:
case X86::PTCVTROWD2PSrreV:
case X86::PTCVTROWD2PSrriV:
case X86::PTCVTROWPS2PBF16HrreV:
@@ -584,6 +577,12 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MI.removeOperand(i);
unsigned Opc;
switch (Opcode) {
+ case X86::PTILELOADDRSV:
+ Opc = X86::TILELOADDRS;
+ break;
+ case X86::PTILELOADDRST1V:
+ Opc = X86::TILELOADDRST1;
+ break;
case X86::PTILELOADDV:
Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
break;
@@ -728,7 +727,11 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case X86::PT2RPNTLVWZ0V:
case X86::PT2RPNTLVWZ0T1V:
case X86::PT2RPNTLVWZ1V:
- case X86::PT2RPNTLVWZ1T1V: {
+ case X86::PT2RPNTLVWZ1T1V:
+ case X86::PT2RPNTLVWZ0RSV:
+ case X86::PT2RPNTLVWZ0RST1V:
+ case X86::PT2RPNTLVWZ1RSV:
+ case X86::PT2RPNTLVWZ1RST1V: {
for (unsigned i = 3; i > 0; --i)
MI.removeOperand(i);
unsigned Opc;
@@ -745,20 +748,6 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case X86::PT2RPNTLVWZ1T1V:
Opc = X86::T2RPNTLVWZ1T1;
break;
- default:
- llvm_unreachable("Impossible Opcode!");
- }
- MI.setDesc(TII->get(Opc));
- return true;
- }
- case X86::PT2RPNTLVWZ0RSV:
- case X86::PT2RPNTLVWZ0RST1V:
- case X86::PT2RPNTLVWZ1RSV:
- case X86::PT2RPNTLVWZ1RST1V: {
- for (unsigned i = 3; i > 0; --i)
- MI.removeOperand(i);
- unsigned Opc;
- switch (Opcode) {
case X86::PT2RPNTLVWZ0RSV:
Opc = X86::T2RPNTLVWZ0RS;
break;
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 96df1d8b464a28..e923d9438e626f 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -5161,6 +5161,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, Res);
return;
}
+ case Intrinsic::x86_tileloaddrs64_internal:
+ case Intrinsic::x86_tileloaddrst164_internal:
+ if (!Subtarget->hasAMXMOVRS())
+ break;
+ [[fallthrough]];
case Intrinsic::x86_tileloadd64_internal:
case Intrinsic::x86_tileloaddt164_internal: {
if (!Subtarget->hasAMXTILE())
@@ -5168,36 +5173,23 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
auto *MFI =
CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
- unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
- ? X86::PTILELOADDV
- : X86::PTILELOADDT1V;
- // _tile_loadd_internal(row, col, buf, STRIDE)
- SDValue Base = Node->getOperand(4);
- SDValue Scale = getI8Imm(1, dl);
- SDValue Index = Node->getOperand(5);
- SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
- SDValue Segment = CurDAG->getRegister(0, MVT::i16);
- SDValue Chain = Node->getOperand(0);
- MachineSDNode *CNode;
- SDValue Ops[] = {Node->getOperand(2),
- Node->getOperand(3),
- Base,
- Scale,
- Index,
- Disp,
- Segment,
- Chain};
- CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
- ReplaceNode(Node, CNode);
- return;
- }
- case Intrinsic::x86_tileloaddrs64_internal:
- case Intrinsic::x86_tileloaddrst164_internal: {
- if (!Subtarget->hasAMXMOVRS())
+ unsigned Opc;
+ switch (IntNo) {
+ default:
+ llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_tileloaddrs64_internal:
+ Opc = X86::PTILELOADDRSV;
+ break;
+ case Intrinsic::x86_tileloaddrst164_internal:
+ Opc = X86::PTILELOADDRST1V;
break;
- unsigned Opc = IntNo == Intrinsic::x86_tileloaddrs64_internal
- ? X86::PTILELOADDRSV
- : X86::PTILELOADDRST1V;
+ case Intrinsic::x86_tileloadd64_internal:
+ Opc = X86::PTILELOADDV;
+ break;
+ case Intrinsic::x86_tileloaddt164_internal:
+ Opc = X86::PTILELOADDT1V;
+ break;
+ }
// _tile_loadd_internal(row, col, buf, STRIDE)
SDValue Base = Node->getOperand(4);
SDValue Scale = getI8Imm(1, dl);
@@ -5301,6 +5293,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, CNode);
return;
}
+ case Intrinsic::x86_tileloaddrs64:
+ case Intrinsic::x86_tileloaddrst164:
+ if (!Subtarget->hasAMXMOVRS())
+ break;
+ [[fallthrough]];
case Intrinsic::x86_tileloadd64:
case Intrinsic::x86_tileloaddt164:
case Intrinsic::x86_tilestored64: {
@@ -5313,46 +5310,14 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
switch (IntNo) {
default: llvm_unreachable("Unexpected intrinsic!");
case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
- case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
- case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
- }
- // FIXME: Match displacement and scale.
- unsigned TIndex = Node->getConstantOperandVal(2);
- SDValue TReg = getI8Imm(TIndex, dl);
- SDValue Base = Node->getOperand(3);
- SDValue Scale = getI8Imm(1, dl);
- SDValue Index = Node->getOperand(4);
- SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
- SDValue Segment = CurDAG->getRegister(0, MVT::i16);
- SDValue Chain = Node->getOperand(0);
- MachineSDNode *CNode;
- if (Opc == X86::PTILESTORED) {
- SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
- CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
- } else {
- SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
- CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
- }
- ReplaceNode(Node, CNode);
- return;
- }
- case Intrinsic::x86_tileloaddrs64:
- case Intrinsic::x86_tileloaddrst164: {
- if (!Subtarget->hasAMXMOVRS())
- break;
- auto *MFI =
- CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
- MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
- unsigned Opc;
- switch (IntNo) {
- default:
- llvm_unreachable("Unexpected intrinsic!");
case Intrinsic::x86_tileloaddrs64:
Opc = X86::PTILELOADDRS;
break;
+ case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
case Intrinsic::x86_tileloaddrst164:
Opc = X86::PTILELOADDRST1;
break;
+ case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
}
// FIXME: Match displacement and scale.
unsigned TIndex = Node->getConstantOperandVal(2);
@@ -5365,15 +5330,22 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue Chain = Node->getOperand(0);
MachineSDNode *CNode;
if (Opc == X86::PTILESTORED) {
- SDValue Ops[] = {Base, Scale, Index, Disp, Segment, TReg, Chain};
+ SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
} else {
- SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
+ SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
}
ReplaceNode(Node, CNode);
return;
}
+ case Intrinsic::x86_t2rpntlvwz0rs:
+ case Intrinsic::x86_t2rpntlvwz0rst1:
+ case Intrinsic::x86_t2rpntlvwz1rs:
+ case Intrinsic::x86_t2rpntlvwz1rst1:
+ if (!Subtarget->hasAMXTRANSPOSE() || !Subtarget->hasAMXMOVRS())
+ break;
+ [[fallthrough]];
case Intrinsic::x86_t2rpntlvwz0:
case Intrinsic::x86_t2rpntlvwz0t1:
case Intrinsic::x86_t2rpntlvwz1:
@@ -5399,31 +5371,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::x86_t2rpntlvwz1t1:
Opc = X86::PT2RPNTLVWZ1T1;
break;
- }
- // FIXME: Match displacement and scale.
- unsigned TIndex = Node->getConstantOperandVal(2);
- SDValue TReg = getI8Imm(TIndex, dl);
- SDValue Base = Node->getOperand(3);
- SDValue Scale = getI8Imm(1, dl);
- SDValue Index = Node->getOperand(4);
- SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
- SDValue Segment = CurDAG->getRegister(0, MVT::i16);
- SDValue Chain = Node->getOperand(0);
- SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
- MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
- ReplaceNode(Node, CNode);
- return;
- }
- case Intrinsic::x86_t2rpntlvwz0rs:
- case Intrinsic::x86_t2rpntlvwz0rst1:
- case Intrinsic::x86_t2rpntlvwz1rs:
- case Intrinsic::x86_t2rpntlvwz1rst1: {
- if (!Subtarget->hasAMXTRANSPOSE() || !Subtarget->hasAMXMOVRS())
- break;
- unsigned Opc;
- switch (IntNo) {
- default:
- llvm_unreachable("Unexpected intrinsic!");
case Intrinsic::x86_t2rpntlvwz0rs:
Opc = X86::PT2RPNTLVWZ0RS;
break;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 10062ff81c4709..839fedc34d1d09 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -27328,9 +27328,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
case Intrinsic::x86_t2rpntlvwz0rst1_internal:
case Intrinsic::x86_t2rpntlvwz1rs_internal:
case Intrinsic::x86_t2rpntlvwz1rst1_internal:
- if (!Subtarget.hasAMXTRANSPOSE() || !Subtarget.hasAMXMOVRS())
- break;
- [[fallthrough]];
case Intrinsic::x86_t2rpntlvwz0_internal:
case Intrinsic::x86_t2rpntlvwz0t1_internal:
case Intrinsic::x86_t2rpntlvwz1_internal:
@@ -37527,6 +37524,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
return BB;
}
+ case X86::PTILELOADDRS:
+ case X86::PTILELOADDRST1:
case X86::PTILELOADD:
case X86::PTILELOADDT1:
case X86::PTILESTORED: {
@@ -37544,33 +37543,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
break;
#undef GET_EGPR_IF_ENABLED
- }
-
- MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
- unsigned CurOp = 0;
- if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
- MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
- RegState::Define);
-
- MIB.add(MI.getOperand(CurOp++)); // base
- MIB.add(MI.getOperand(CurOp++)); // scale
- MIB.add(MI.getOperand(CurOp++)); // index -- stride
- MIB.add(MI.getOperand(CurOp++)); // displacement
- MIB.add(MI.getOperand(CurOp++)); // segment
-
- if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
- MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
- RegState::Undef);
-
- MI.eraseFromParent(); // The pseudo is gone now.
- return BB;
- }
- case X86::PTILELOADDRS:
- case X86::PTILELOADDRST1: {
- unsigned Opc;
- switch (MI.getOpcode()) {
- default:
- llvm_unreachable("illegal opcode!");
case X86::PTILELOADDRS:
Opc = X86::TILELOADDRS;
break;
@@ -37578,19 +37550,23 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
Opc = X86::TILELOADDRST1;
break;
}
+
MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
unsigned CurOp = 0;
- if (Opc != X86::TILESTORED)
+ if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
RegState::Define);
+
MIB.add(MI.getOperand(CurOp++)); // base
MIB.add(MI.getOperand(CurOp++)); // scale
MIB.add(MI.getOperand(CurOp++)); // index -- stride
MIB.add(MI.getOperand(CurOp++)); // displacement
MIB.add(MI.getOperand(CurOp++)); // segment
- if (Opc == X86::TILESTORED)
+
+ if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
RegState::Undef);
+
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
@@ -37613,6 +37589,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
+ case X86::PT2RPNTLVWZ0RS:
+ case X86::PT2RPNTLVWZ0RST1:
+ case X86::PT2RPNTLVWZ1RS:
+ case X86::PT2RPNTLVWZ1RST1:
case X86::PT2RPNTLVWZ0:
case X86::PT2RPNTLVWZ0T1:
case X86::PT2RPNTLVWZ1:
@@ -37634,27 +37614,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::PT2RPNTLVWZ1T1:
Opc = X86::T2RPNTLVWZ1T1;
break;
- }
- MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
- MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
-
- MIB.add(MI.getOperand(1)); // base
- MIB.add(MI.getOperand(2)); // scale
- MIB.add(MI.getOperand(3)); // index
- MIB.add(MI.getOperand(4)); // displacement
- MIB.add(MI.getOperand(5)); // segment
- MI.eraseFromParent(); // The pseudo is gone now.
- return BB;
- }
- case X86::PT2RPNTLVWZ0RS:
- case X86::PT2RPNTLVWZ0RST1:
- case X86::PT2RPNTLVWZ1RS:
- case X86::PT2RPNTLVWZ1RST1: {
- const DebugLoc &DL = MI.getDebugLoc();
- unsigned Opc;
- switch (MI.getOpcode()) {
- default:
- llvm_unreachable("Unexpected instruction!");
case X86::PT2RPNTLVWZ0RS:
Opc = X86::T2RPNTLVWZ0RS;
break;
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index 37bdc71fb5da29..68ba40eb315d77 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -433,32 +433,6 @@ let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in {
let mayLoad = 1 in
def PTILELOADDRST1 : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>;
}
-
- def TILELOADDRSrm_EVEX : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst),
- (ins sibmem:$src1),
- "tileloaddrs\t{$src1, $dst|$dst, $src1}",
- []>, EVEX, NoCD8, T8, XD;
- def TILELOADDRST1rm_EVEX : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst),
- (ins sibmem:$src1),
- "tileloaddrst1\t{$src1, $dst|$dst, $src1}",
- []>, EVEX, NoCD8, T8, PD;
-
- def T2RPNTLVWZ0RS_EVEX : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst),
- (ins sibmem:$src1),
- "t2rpntlvwz0rs\t{$src1, $dst|$dst, $src1}",
- []>, EVEX, NoCD8, T_MAP5;
- def T2RPNTLVWZ0RST1_EVEX : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst),
- (ins sibmem:$src1),
- "t2rpntlvwz0rst1\t{$src1, $dst|$dst, $src1}",
- []>, EVEX, NoCD8, T_MAP5;
- def T2RPNTLVWZ1RS_EVEX : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst),
- (ins sibmem:$src1),
- "t2rpntlvwz1rs\t{$src1, $dst|$dst, $src1}",
- []>, EVEX, NoCD8, T_MAP5, PD;
- def T2RPNTLVWZ1RST1_EVEX : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst),
- (ins sibmem:$src1),
- "t2rpntlvwz1rst1\t{$src1, $dst|$dst, $src1}",
- []>, EVEX, NoCD8, T_MAP5, PD;
} // HasAMXMOVRS, In64BitMode
multiclass m_tcvtrowd2ps {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 850f74e666adc6..1b95450596314b 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4737,7 +4737,6 @@ static bool isAMXOpcode(unsigned Opc) {
case X86::TILELOADD_EVEX:
case X86::TILESTORED_EVEX:
case X86::PTILEPAIRLOAD:
- case X86::TILELOADDRS:
case X86::PTILEPAIRSTORE:
return true;
}
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index a973aaaa4806e6..a3322f7d328467 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1880,6 +1880,7 @@ const StringMap<bool> sys::getHostCPUFeatures() {
!getX86CpuIDAndInfoEx(0x1e, 0x1, &EAX, &EBX, &ECX, &EDX);
Features["amx-fp8"] = HasLeaf1E && ((EAX >> 4) & 1) && HasAMXSave;
Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave;
+ Features["amx-movrs"] = HasLeaf1E && ((EAX >> 8) & 1) && HasAMXSave;
Features["amx-avx512"] = HasLeaf1E && ((EAX >> 7) & 1) && HasAMXSave;
bool HasLeaf24 =
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index eb55e6fc9134c8..4039e6a0243cc9 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -600,6 +600,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE;
+constexpr FeatureBitset ImpliedFeaturesAMX_MOVRS = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_AVX512 =
FeatureAMX_TILE | FeatureAVX10_2_512;
constexpr FeatureBitset ImpliedFeaturesHRESET = {};
>From cd6090f5ee51c45f961bc6ef47169d85d75bf78a Mon Sep 17 00:00:00 2001
From: Malay Sanghi <malay.sanghi at intel.com>
Date: Mon, 11 Nov 2024 20:45:10 -0800
Subject: [PATCH 4/4] review2
---
clang/lib/Headers/amxmovrstransposeintrin.h | 3 +--
clang/lib/Headers/immintrin.h | 4 ++++
clang/test/CodeGen/X86/amx_movrs_errors.c | 3 +--
llvm/lib/Target/X86/X86.td | 4 ++--
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 2 +-
llvm/lib/Target/X86/X86InstrAMX.td | 17 ++++++++---------
llvm/lib/TargetParser/Host.cpp | 2 +-
7 files changed, 18 insertions(+), 17 deletions(-)
diff --git a/clang/lib/Headers/amxmovrstransposeintrin.h b/clang/lib/Headers/amxmovrstransposeintrin.h
index 84360e6bef8388..17a9f7506a0421 100644
--- a/clang/lib/Headers/amxmovrstransposeintrin.h
+++ b/clang/lib/Headers/amxmovrstransposeintrin.h
@@ -1,5 +1,4 @@
-/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics -*- C++
- * -*---------===
+/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics --------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 502e1c12a4e91c..f0dd7160ec7ff4 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -658,6 +658,10 @@ _storebe_i64(void * __P, long long __D) {
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_MOVRS__)
#include <amxmovrsintrin.h>
+#endif
+
+#if !defined(__SCE__) || __has_feature(modules) || \
+ (defined(__AMX_MOVRS__) && defined(__AMX_TRANSPOSE__))
#include <amxmovrstransposeintrin.h>
#endif
diff --git a/clang/test/CodeGen/X86/amx_movrs_errors.c b/clang/test/CodeGen/X86/amx_movrs_errors.c
index 4263e75ce9a28c..495ea299236e2f 100755
--- a/clang/test/CodeGen/X86/amx_movrs_errors.c
+++ b/clang/test/CodeGen/X86/amx_movrs_errors.c
@@ -1,7 +1,6 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
-// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 \
-// RUN: -target-feature +amx-element -verify
+// RUN: -target-feature +amx-movrs -verify
#include <immintrin.h>
#include <stddef.h>
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 3b95d06c464a39..509632183dc016 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -274,8 +274,8 @@ def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true",
"Support AMX-FP8 instructions",
[FeatureAMXTILE]>;
def FeatureAMXMOVRS : SubtargetFeature<"amx-movrs", "HasAMXMOVRS", "true",
- "Support AMX-MOVRS instructions",
- [FeatureAMXTILE]>;
+ "Support AMX-MOVRS instructions",
+ [FeatureAMXTILE]>;
def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true",
"Support AMX amx-transpose instructions",
[FeatureAMXTILE]>;
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index e923d9438e626f..72de0e0e8761f2 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -5343,7 +5343,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::x86_t2rpntlvwz0rst1:
case Intrinsic::x86_t2rpntlvwz1rs:
case Intrinsic::x86_t2rpntlvwz1rst1:
- if (!Subtarget->hasAMXTRANSPOSE() || !Subtarget->hasAMXMOVRS())
+ if (!Subtarget->hasAMXMOVRS())
break;
[[fallthrough]];
case Intrinsic::x86_t2rpntlvwz0:
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index 282d85e9a4b6dc..059bfb4c70bcf9 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -419,18 +419,17 @@ let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in {
"tileloaddrst1\t{$src1, $dst|$dst, $src1}",
[]>, VEX, T8, PD;
- let isPseudo = true, mayLoad = 1 in
- def PTILELOADDRSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
- GR16:$src2,
- opaquemem:$src3), []>;
- let isPseudo = true, mayLoad = 1 in
- def PTILELOADDRST1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+ let isPseudo = true, mayLoad = 1 in {
+ def PTILELOADDRSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
GR16:$src2,
opaquemem:$src3), []>;
- let usesCustomInserter = 1 in {
- let mayLoad = 1 in
+ def PTILELOADDRST1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+ GR16:$src2,
+ opaquemem:$src3), []>;
+ }
+
+ let usesCustomInserter = 1, mayLoad = 1 in {
def PTILELOADDRS : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>;
- let mayLoad = 1 in
def PTILELOADDRST1 : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>;
}
} // HasAMXMOVRS, In64BitMode
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 39115a6d7266ac..58ba2553633221 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1880,9 +1880,9 @@ const StringMap<bool> sys::getHostCPUFeatures() {
!getX86CpuIDAndInfoEx(0x1e, 0x1, &EAX, &EBX, &ECX, &EDX);
Features["amx-fp8"] = HasLeaf1E && ((EAX >> 4) & 1) && HasAMXSave;
Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave;
- Features["amx-movrs"] = HasLeaf1E && ((EAX >> 8) & 1) && HasAMXSave;
Features["amx-tf32"] = HasLeaf1E && ((EAX >> 6) & 1) && HasAMXSave;
Features["amx-avx512"] = HasLeaf1E && ((EAX >> 7) & 1) && HasAMXSave;
+ Features["amx-movrs"] = HasLeaf1E && ((EAX >> 8) & 1) && HasAMXSave;
bool HasLeaf24 =
MaxLevel >= 0x24 && !getX86CpuIDAndInfo(0x24, &EAX, &EBX, &ECX, &EDX);
More information about the cfe-commits
mailing list