[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE (PR #113532)
Phoebe Wang via cfe-commits
cfe-commits at lists.llvm.org
Fri Nov 1 00:44:33 PDT 2024
https://github.com/phoebewang updated https://github.com/llvm/llvm-project/pull/113532
>From 1b42b133daaa028e0ac51e3f656aff2faed7a7c6 Mon Sep 17 00:00:00 2001
From: "Wang, Phoebe" <phoebe.wang at intel.com>
Date: Thu, 24 Oct 2024 14:35:37 +0800
Subject: [PATCH 1/3] [X86][AMX] Support AMX-TRANSPOSE
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368
---
clang/docs/ReleaseNotes.rst | 2 +
clang/include/clang/Basic/BuiltinsX86_64.def | 11 +
clang/include/clang/Driver/Options.td | 2 +
clang/lib/Basic/Targets/X86.cpp | 6 +
clang/lib/Basic/Targets/X86.h | 1 +
clang/lib/CodeGen/CGBuiltin.cpp | 52 ++++
clang/lib/Headers/CMakeLists.txt | 1 +
clang/lib/Headers/amxintrin.h | 2 +
clang/lib/Headers/amxtransposeintrin.h | 248 ++++++++++++++++++
clang/lib/Headers/immintrin.h | 4 +
clang/lib/Sema/SemaX86.cpp | 6 +
clang/test/CodeGen/X86/amx_transpose.c | 36 +++
clang/test/CodeGen/X86/amx_transpose_api.c | 66 +++++
clang/test/CodeGen/X86/amx_transpose_errors.c | 31 +++
clang/test/Driver/x86-target-features.c | 7 +
clang/test/Preprocessor/x86_target_features.c | 12 +
llvm/include/llvm/CodeGen/TileShapeInfo.h | 87 +++++-
llvm/include/llvm/IR/IntrinsicsX86.td | 37 +++
.../Support/X86DisassemblerDecoderCommon.h | 1 +
.../llvm/TargetParser/X86TargetParser.def | 1 +
llvm/lib/Target/X86/AsmParser/X86Operand.h | 31 +++
.../X86/Disassembler/X86Disassembler.cpp | 5 +
.../X86/Disassembler/X86DisassemblerDecoder.h | 7 +
.../X86/MCTargetDesc/X86InstPrinterCommon.cpp | 19 ++
.../X86/MCTargetDesc/X86InstPrinterCommon.h | 1 +
llvm/lib/Target/X86/X86.td | 3 +
llvm/lib/Target/X86/X86ExpandPseudo.cpp | 125 +++++++++
llvm/lib/Target/X86/X86FastPreTileConfig.cpp | 53 ++--
llvm/lib/Target/X86/X86FastTileConfig.cpp | 40 ++-
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 70 +++++
llvm/lib/Target/X86/X86ISelLowering.cpp | 94 +++++++
llvm/lib/Target/X86/X86InstrAMX.td | 63 +++++
llvm/lib/Target/X86/X86InstrInfo.cpp | 12 +-
llvm/lib/Target/X86/X86InstrOperands.td | 7 +
llvm/lib/Target/X86/X86InstrPredicates.td | 1 +
llvm/lib/Target/X86/X86LowerAMXType.cpp | 246 +++++++++++++----
llvm/lib/Target/X86/X86PreTileConfig.cpp | 45 +++-
llvm/lib/Target/X86/X86RegisterInfo.cpp | 60 ++++-
llvm/lib/Target/X86/X86RegisterInfo.td | 9 +
llvm/lib/Target/X86/X86TileConfig.cpp | 82 +++++-
llvm/lib/TargetParser/Host.cpp | 4 +
llvm/lib/TargetParser/X86TargetParser.cpp | 1 +
.../CodeGen/X86/amx_tile_pair_O2_to_O0.ll | 136 ++++++++++
.../X86/amx_tile_pair_configure_O0.mir | 165 ++++++++++++
.../X86/amx_tile_pair_configure_O2.mir | 153 +++++++++++
llvm/test/CodeGen/X86/amx_tile_pair_copy.mir | 97 +++++++
.../X86/amx_tile_pair_lower_type_O0.ll | 86 ++++++
.../X86/amx_tile_pair_lower_type_O2.ll | 60 +++++
.../X86/amx_tile_pair_preconfigure_O0.mir | 134 ++++++++++
.../X86/amx_tile_pair_preconfigure_O2.mir | 113 ++++++++
.../CodeGen/X86/amx_transpose_intrinsics.ll | 150 +++++++++++
llvm/test/CodeGen/X86/ipra-reg-usage.ll | 4 +-
.../MC/Disassembler/X86/amx-transpose-att.s | 57 ++++
.../MC/Disassembler/X86/amx-transpose-att.txt | 58 ++++
.../MC/Disassembler/X86/amx-transpose-intel.s | 57 ++++
llvm/unittests/CodeGen/InstrRefLDVTest.cpp | 6 +-
llvm/utils/TableGen/X86RecognizableInstr.cpp | 4 +
57 files changed, 2751 insertions(+), 120 deletions(-)
create mode 100644 clang/lib/Headers/amxtransposeintrin.h
create mode 100644 clang/test/CodeGen/X86/amx_transpose.c
create mode 100644 clang/test/CodeGen/X86/amx_transpose_api.c
create mode 100644 clang/test/CodeGen/X86/amx_transpose_errors.c
create mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll
create mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir
create mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir
create mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_copy.mir
create mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll
create mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll
create mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir
create mode 100644 llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir
create mode 100644 llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll
create mode 100644 llvm/test/MC/Disassembler/X86/amx-transpose-att.s
create mode 100644 llvm/test/MC/Disassembler/X86/amx-transpose-att.txt
create mode 100644 llvm/test/MC/Disassembler/X86/amx-transpose-intel.s
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index ce046a305c89b6..dc58f98af55cc9 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -623,6 +623,8 @@ X86 Support
- All intrinsics in tbmintrin.h can now be used in constant expressions.
+- Support ISA of ``AMX-TRANSPOSE``.
+
Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 2c591edb2835cd..4e95a8a73d550a 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -128,6 +128,11 @@ TARGET_BUILTIN(__builtin_ia32_tdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i",
TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16")
TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose")
// AMX
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
@@ -148,6 +153,12 @@ TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite")
TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0, "vIUcvC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1, "vIUcvC*z", "n","amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n","amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose")
+
TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi")
TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd")
TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiv*SLLiSLLiIi", "n", "cmpccxadd")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 2ddb2f5312148e..c55f2b86f4cb1f 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6287,6 +6287,8 @@ def mamx_int8 : Flag<["-"], "mamx-int8">, Group<m_x86_Features_Group>;
def mno_amx_int8 : Flag<["-"], "mno-amx-int8">, Group<m_x86_Features_Group>;
def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
+def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>;
+def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>;
def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>;
def mno_cmpccxadd : Flag<["-"], "mno-cmpccxadd">, Group<m_x86_Features_Group>;
def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 5448bd841959f4..fe5b600e6777fb 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -418,6 +418,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAMXTILE = true;
} else if (Feature == "+amx-complex") {
HasAMXCOMPLEX = true;
+ } else if (Feature == "+amx-transpose") {
+ HasAMXTRANSPOSE = true;
} else if (Feature == "+cmpccxadd") {
HasCMPCCXADD = true;
} else if (Feature == "+raoint") {
@@ -935,6 +937,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AMX_FP16__");
if (HasAMXCOMPLEX)
Builder.defineMacro("__AMX_COMPLEX__");
+ if (HasAMXTRANSPOSE)
+ Builder.defineMacro("__AMX_TRANSPOSE__");
if (HasCMPCCXADD)
Builder.defineMacro("__CMPCCXADD__");
if (HasRAOINT)
@@ -1065,6 +1069,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("amx-fp16", true)
.Case("amx-int8", true)
.Case("amx-tile", true)
+ .Case("amx-transpose", true)
.Case("avx", true)
.Case("avx10.1-256", true)
.Case("avx10.1-512", true)
@@ -1182,6 +1187,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("amx-fp16", HasAMXFP16)
.Case("amx-int8", HasAMXINT8)
.Case("amx-tile", HasAMXTILE)
+ .Case("amx-transpose", HasAMXTRANSPOSE)
.Case("avx", SSELevel >= AVX)
.Case("avx10.1-256", HasAVX10_1)
.Case("avx10.1-512", HasAVX10_1_512)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index a99ae62984c7d5..3e1fb41082950c 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -156,6 +156,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasAMXINT8 = false;
bool HasAMXBF16 = false;
bool HasAMXCOMPLEX = false;
+ bool HasAMXTRANSPOSE = false;
bool HasSERIALIZE = false;
bool HasTSXLDTRK = false;
bool HasUSERMSR = false;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 3f28b7f26c36fe..67d28ccec0f373 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16920,6 +16920,58 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
// instruction, but it will create a memset that won't be optimized away.
return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true);
}
+ // Corresponding to intrisics which will return 2 tiles (tile0_tile1).
+ case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
+ case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
+ case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
+ case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: {
+ Intrinsic::ID IID;
+ switch (BuiltinID) {
+ default:
+ llvm_unreachable("Unsupported intrinsic!");
+ case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
+ IID = Intrinsic::x86_t2rpntlvwz0_internal;
+ break;
+ case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
+ IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
+ break;
+ case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
+ IID = Intrinsic::x86_t2rpntlvwz1_internal;
+ break;
+ case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
+ IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
+ break;
+ }
+
+ // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)
+ Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
+ {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]});
+
+ auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>();
+ assert(PtrTy && "arg3 must be of pointer type");
+ QualType PtreeTy = PtrTy->getPointeeType();
+ llvm::Type *TyPtee = ConvertType(PtreeTy);
+
+ // Bitcast amx type (x86_amx) to vector type (256 x i32)
+ // Then store tile0 into DstPtr0
+ Value *T0 = Builder.CreateExtractValue(Call, 0);
+ Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
+ {TyPtee}, {T0});
+ Builder.CreateDefaultAlignedStore(VecT0, Ops[3]);
+
+ // Then store tile1 into DstPtr1
+ Value *T1 = Builder.CreateExtractValue(Call, 1);
+ Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
+ {TyPtee}, {T1});
+ Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]);
+
+ // Note: Here we escape directly use x86_tilestored64_internal to store
+ // the results due to it can't make sure the Mem writen scope. This may
+ // cause shapes reloads after first amx intrinsic, which current amx reg-
+ // ister allocation has no ability to handle it.
+
+ return Store;
+ }
case X86::BI__ud2:
// llvm.trap makes a ud2a instruction on x86.
return EmitTrapCall(Intrinsic::trap);
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index ff392e7122a448..708525198324bb 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -149,6 +149,7 @@ set(x86_files
amxcomplexintrin.h
amxfp16intrin.h
amxintrin.h
+ amxtransposeintrin.h
avx10_2_512bf16intrin.h
avx10_2_512convertintrin.h
avx10_2_512minmaxintrin.h
diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h
index baa56f5b28e8e5..f07a5689011853 100644
--- a/clang/lib/Headers/amxintrin.h
+++ b/clang/lib/Headers/amxintrin.h
@@ -232,6 +232,8 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// bytes. Since there is no 2D type in llvm IR, we use vector type to
/// represent 2D tile and the fixed size is maximum amx tile register size.
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
+typedef int _tile1024i_1024a
+ __attribute__((__vector_size__(1024), __aligned__(1024)));
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h
new file mode 100644
index 00000000000000..d5dc68f4152848
--- /dev/null
+++ b/clang/lib/Headers/amxtransposeintrin.h
@@ -0,0 +1,248 @@
+/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * ===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_TRANSPOSEINTRIN_H
+#define __AMX_TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_TRANSPOSE \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))
+
+#define _tile_2rpntlvwz0(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz0(tdst, base, stride)
+#define _tile_2rpntlvwz0t1(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride)
+#define _tile_2rpntlvwz1(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz1(tdst, base, stride)
+#define _tile_2rpntlvwz1t1(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)
+
+/// Transpose 32-bit elements from \a src and write the result to \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void __tile_transposed(__tile dst, __tile src);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src
+/// The 1st source tile. Max size is 1024 Bytes.
+///
+/// \code{.operation}
+///
+/// FOR i := 0 TO (dst.rows-1)
+/// tmp[511:0] := 0
+/// FOR j := 0 TO (dst.colsb/4-1)
+/// tmp.dword[j] := src.row[j].dword[i]
+/// ENDFOR
+/// dst.row[i] := tmp
+/// ENDFOR
+///
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src)
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ // Use __tile1024i_1024a* to escape the alignment check in
+ // clang/test/Headers/x86-intrinsics-headers-clean.cpp
+ __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
+ (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ __builtin_ia32_t2rpntlvwz0t1_internal(
+ row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
+ (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ __builtin_ia32_t2rpntlvwz1t1_internal(
+ row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+// This is internal intrinsic. C/C++ user should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE
+_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) {
+ return __builtin_ia32_ttransposed_internal(m, n, src);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+/// Provides a hint to the implementation that the data will likely not be
+/// reused in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE
+static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE
+static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written. The last row will be not be read from memory but instead
+/// filled with zeros.
+/// Provides a hint to the implementation that the data will likely not be
+/// reused in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE
+static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written. The last row will be not be read from memory but instead
+/// filled with zeros.
+/// Provides a hint to the implementation that the data will likely not be
+/// reused in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE
+static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+/// Transpose 32-bit elements from src and write the result to dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src
+/// The 1st source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_TRANSPOSE
+static void __tile_transposed(__tile1024i *dst, __tile1024i src) {
+ dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile);
+}
+
+#endif /* __x86_64__ */
+#endif /* __AMX_TRANSPOSEINTRIN_H */
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 3fbabffa98df20..b6c13a6c8ec780 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -638,6 +638,10 @@ _storebe_i64(void * __P, long long __D) {
#include <amxcomplexintrin.h>
#endif
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TRANSPOSE__)
+#include <amxtransposeintrin.h>
+#endif
+
#if !defined(__SCE__) || __has_feature(modules) || \
defined(__AVX512VP2INTERSECT__)
#include <avx512vp2intersectintrin.h>
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index 6a4d78f0ca9084..9e250ac8f906cf 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -631,6 +631,10 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_tileloaddt164:
case X86::BI__builtin_ia32_tilestored64:
case X86::BI__builtin_ia32_tilezero:
+ case X86::BI__builtin_ia32_t2rpntlvwz0:
+ case X86::BI__builtin_ia32_t2rpntlvwz0t1:
+ case X86::BI__builtin_ia32_t2rpntlvwz1:
+ case X86::BI__builtin_ia32_t2rpntlvwz1t1:
return CheckBuiltinTileArgumentsRange(TheCall, 0);
case X86::BI__builtin_ia32_tdpbssd:
case X86::BI__builtin_ia32_tdpbsud:
@@ -641,6 +645,8 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_tcmmimfp16ps:
case X86::BI__builtin_ia32_tcmmrlfp16ps:
return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2});
+ case X86::BI__builtin_ia32_ttransposed:
+ return CheckBuiltinTileArgumentsRange(TheCall, {0, 1});
}
}
static bool isX86_32Builtin(unsigned BuiltinID) {
diff --git a/clang/test/CodeGen/X86/amx_transpose.c b/clang/test/CodeGen/X86/amx_transpose.c
new file mode 100644
index 00000000000000..deefc592c7ae66
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_transpose.c
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-transpose \
+// RUN: -target-feature +avx512f -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression| FileCheck %s
+
+#include <immintrin.h>
+#include <stddef.h>
+
+void test_tile_2rpntlvwz0(const void *A, size_t B) {
+ // CHECK-LABEL: @test_tile_2rpntlvwz0
+ // CHECK: call void @llvm.x86.t2rpntlvwz0(i8 1, ptr %{{.*}}, i64 %{{.*}})
+ _tile_2rpntlvwz0(1, A, B);
+}
+
+void test_tile_2rpntlvwz0t1(const void *A, size_t B) {
+ // CHECK-LABEL: @test_tile_2rpntlvwz0t1
+ // CHECK: call void @llvm.x86.t2rpntlvwz0t1(i8 1, ptr %{{.*}}, i64 %{{.*}})
+ _tile_2rpntlvwz0t1(1, A, B);
+}
+
+void test_tile_2rpntlvwz1(const void *A, size_t B) {
+ // CHECK-LABEL: @test_tile_2rpntlvwz1
+ // CHECK: call void @llvm.x86.t2rpntlvwz1(i8 1, ptr %{{.*}}, i64 %{{.*}})
+ _tile_2rpntlvwz1(1, A, B);
+}
+
+void test_tile_2rpntlvwz1t1(const void *A, size_t B) {
+ // CHECK-LABEL: @test_tile_2rpntlvwz1t1
+ // CHECK: call void @llvm.x86.t2rpntlvwz1t1(i8 1, ptr %{{.*}}, i64 %{{.*}})
+ _tile_2rpntlvwz1t1(1, A, B);
+}
+
+void test_tile_transposed(void)
+{
+ // CHECK-LABEL: @test_tile_transposed
+ // CHECK: call void @llvm.x86.ttransposed(i8 1, i8 2)
+ _tile_transposed(1, 2);
+}
diff --git a/clang/test/CodeGen/X86/amx_transpose_api.c b/clang/test/CodeGen/X86/amx_transpose_api.c
new file mode 100644
index 00000000000000..10310c2332b7a6
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_transpose_api.c
@@ -0,0 +1,66 @@
+// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx512f \
+// RUN: -target-feature +amx-transpose \
+// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK
+
+#include <immintrin.h>
+
+char buf[2048];
+#define STRIDE 32
+
+char buf2[2048];
+
+void test_tile_2rpntlvwz0(__tile1024i dst0, __tile1024i dst1) {
+ //CHECK-LABEL: @test_tile_2rpntlvwz0
+ //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ __tile_2rpntlvwz0(&dst0, &dst1, buf, STRIDE);
+}
+
+void test_tile_2rpntlvwz0t1(__tile1024i dst0, __tile1024i dst1) {
+ //CHECK-LABEL: @test_tile_2rpntlvwz0t1
+ //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ __tile_2rpntlvwz0t1(&dst0, &dst1, buf, STRIDE);
+}
+
+void test_tile_2rpntlvwz1(__tile1024i dst0, __tile1024i dst1) {
+ //CHECK-LABEL: @test_tile_2rpntlvwz1
+ //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ __tile_2rpntlvwz1(&dst0, &dst1, buf, STRIDE);
+}
+
+void test_tile_2rpntlvwz1t1(__tile1024i dst0, __tile1024i dst1) {
+ //CHECK-LABEL: @test_tile_2rpntlvwz1t1
+ //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
+ //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+ __tile_2rpntlvwz1t1(&dst0, &dst1, buf, STRIDE);
+}
+
+void test_tile_transposed(__tile1024i dst, __tile1024i src) {
+ //CHECK-LABEL: @test_tile_transposed
+ //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
+ //CHECK-DAG: call x86_amx @llvm.x86.ttransposed.internal
+ //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ __tile_transposed(&dst, src);
+}
diff --git a/clang/test/CodeGen/X86/amx_transpose_errors.c b/clang/test/CodeGen/X86/amx_transpose_errors.c
new file mode 100644
index 00000000000000..80084c42a240dd
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_transpose_errors.c
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-transpose \
+// RUN: -target-feature +avx512f -target-feature +amx-element-evex -verify
+
+#include <immintrin.h>
+#include <stddef.h>
+#include <immintrin.h>
+#include <stddef.h>
+
+// Transpose
+void test_tile_2rpntlvwz0(const void *A, size_t B) {
+ _tile_2rpntlvwz0(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
+
+void test_tile_2rpntlvwz0t1(const void *A, size_t B) {
+ _tile_2rpntlvwz0t1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
+
+void test_tile_2rpntlvwz1(const void *A, size_t B) {
+ _tile_2rpntlvwz1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
+
+void test_tile_2rpntlvwz1t1(const void *A, size_t B) {
+ _tile_2rpntlvwz1t1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
+
+void test_tile_transposed()
+{
+ _tile_transposed(8, 2); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ _tile_transposed(1, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index ddfbb29a48f8d5..d203a81e6d1e9a 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -304,6 +304,13 @@
// AMX-COMPLEX: "-target-feature" "+amx-complex"
// NO-AMX-COMPLEX: "-target-feature" "-amx-complex"
+// RUN: %clang -target x86_64-unknown-linux-gnu -mamx-transpose %s \
+// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-TRANSPOSE %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -mno-amx-transpose %s \
+// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AMX-TRANSPOSE %s
+// AMX-TRANSPOSE: "-target-feature" "+amx-transpose"
+// NO-AMX-TRANSPOSE: "-target-feature" "-amx-transpose"
+
// RUN: %clang --target=i386 -march=i386 -mhreset %s -### 2>&1 | FileCheck -check-prefix=HRESET %s
// RUN: %clang --target=i386 -march=i386 -mno-hreset %s -### 2>&1 | FileCheck -check-prefix=NO-HRESET %s
// HRESET: "-target-feature" "+hreset"
diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
index 8b4e6bdc09226a..7d16f3fa240bc0 100644
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -546,6 +546,18 @@
// NO-AMX-COMPLEX-NOT: #define __AMX_COMPLEX__ 1
+// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-transpose -x c \
+// RUN: -E -dM -o - %s | FileCheck -check-prefix=AMX-TRANSPOSE %s
+
+// AMX-TRANSPOSE: #define __AMX_TRANSPOSE__ 1
+
+// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mno-amx-transpose -x c \
+// RUN: -E -dM -o - %s | FileCheck -check-prefix=NO-AMX-TRANSPOSE %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-transpose -mno-amx-tile \
+// RUN: -x c -E -dM -o - %s | FileCheck -check-prefix=NO-AMX-TRANSPOSE %s
+
+// NO-AMX-TRANSPOSE-NOT: #define __AMX_TRANSPOSE__ 1
+
// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNI %s
// AVXVNNI: #define __AVX2__ 1
diff --git a/llvm/include/llvm/CodeGen/TileShapeInfo.h b/llvm/include/llvm/CodeGen/TileShapeInfo.h
index d00fe5c5535f5e..0e0a883b0c5953 100644
--- a/llvm/include/llvm/CodeGen/TileShapeInfo.h
+++ b/llvm/include/llvm/CodeGen/TileShapeInfo.h
@@ -34,9 +34,31 @@ class ShapeT {
if (MRI)
deduceImm(MRI);
}
+ // When ShapeT has mult shapes, we only use Shapes (never use Row and Col)
+ // and ImmShapes. Due to the most case is only one shape (just simply use
+ // Shape.Row or Shape.Col), so here we don't merge Row and Col into vertor
+ // Shapes to keep the speed and code simplicity.
+ // TODO: The upper solution is a temporary way to minimize current tile
+ // register allocation code changes. It can not handle both Reg shape and
+ // Imm shape for different shapes (e.g. shape 1 is reg shape while shape 2
+ // is imm shape). Refine me when we have more mult-tile shape instructions!
+ ShapeT(ArrayRef<MachineOperand *> ShapesOperands,
+ const MachineRegisterInfo *MRI = nullptr)
+ : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
+ ColImm(InvalidImmShape) {
+ assert(ShapesOperands.size() % 2 == 0 && "Miss row or col!");
+
+ for (auto *Shape : ShapesOperands)
+ Shapes.push_back(Shape);
+
+ if (MRI)
+ deduceImm(MRI);
+ }
ShapeT()
: Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
ColImm(InvalidImmShape) {}
+ // TODO: We need to extern cmp operator for muti-shapes if
+ // we have requirement in the future.
bool operator==(const ShapeT &Shape) const {
MachineOperand *R = Shape.Row;
MachineOperand *C = Shape.Col;
@@ -53,13 +75,40 @@ class ShapeT {
bool operator!=(const ShapeT &Shape) const { return !(*this == Shape); }
- MachineOperand *getRow() const { return Row; }
+ MachineOperand *getRow(unsigned I = 0) const {
+ if (Shapes.empty())
+ return Row;
+ assert(Shapes.size() / 2 >= I && "Get invalid row from id!");
+ return Shapes[I * 2];
+ }
- MachineOperand *getCol() const { return Col; }
+ MachineOperand *getCol(unsigned I = 0) const {
+ if (Shapes.empty())
+ return Col;
+ assert(Shapes.size() / 2 >= I && "Get invalid col from id!");
+ return Shapes[I * 2 + 1];
+ }
- int64_t getRowImm() const { return RowImm; }
+ int64_t getRowImm(unsigned I = 0) const {
+ if (ImmShapes.empty())
+ return RowImm;
+ assert(ImmShapes.size() / 2 >= I && "Get invalid imm row from id!");
+ return ImmShapes[I * 2];
+ }
- int64_t getColImm() const { return ColImm; }
+ int64_t getColImm(unsigned I = 0) const {
+ if (ImmShapes.empty())
+ return ColImm;
+ assert(ImmShapes.size() / 2 >= I && "Get invalid imm col from id!");
+ return ImmShapes[I * 2 + 1];
+ }
+
+ unsigned getShapeNum() {
+ if (Shapes.empty())
+ return isValid() ? 1 : 0;
+ else
+ return Shapes.size() / 2;
+ }
bool isValid() { return (Row != nullptr) && (Col != nullptr); }
@@ -72,14 +121,35 @@ class ShapeT {
for (const MachineOperand &DefMO : MRI->def_operands(Reg)) {
const auto *MI = DefMO.getParent();
if (MI->isMoveImmediate()) {
- Imm = MI->getOperand(1).getImm();
+ assert(MI->getNumOperands() == 2 &&
+ "Unsupported number of operands in instruction for setting "
+ "row/column.");
+ if (MI->getOperand(1).isImm()) {
+ Imm = MI->getOperand(1).getImm();
+ } else {
+ assert(MI->getOperand(1).isImplicit() &&
+ "Operand 1 is assumed to be implicit.");
+ Imm = 0;
+ }
break;
}
}
return Imm;
};
- RowImm = GetImm(Row->getReg());
- ColImm = GetImm(Col->getReg());
+ if (Shapes.empty()) { // Single Shape
+ RowImm = GetImm(Row->getReg());
+ ColImm = GetImm(Col->getReg());
+ // The number of rows of 2nd destination buffer is assigned by the one of
+ // 1st destination buffer. If the column size is equal to zero, the row
+ // size should be reset to zero too.
+ if (ColImm == 0)
+ Row = Col;
+ } else { // Multiple Shapes
+ for (auto *Shape : Shapes) {
+ int64_t ImmShape = GetImm(Shape->getReg());
+ ImmShapes.push_back(ImmShape);
+ }
+ }
}
private:
@@ -88,6 +158,9 @@ class ShapeT {
MachineOperand *Col;
int64_t RowImm = -1;
int64_t ColImm = -1;
+ // Multiple Shapes
+ SmallVector<MachineOperand *, 0> Shapes;
+ SmallVector<int64_t, 0> ImmShapes;
};
} // namespace llvm
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 5262e3154ff721..0c27ac5d7cec7a 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5917,6 +5917,23 @@ let TargetPrefix = "x86" in {
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
ImmArg<ArgIndex<2>>]>;
+ // AMX-TRANSPOSE
+ def int_x86_t2rpntlvwz0 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+ [ImmArg<ArgIndex<0>>]>;
+ def int_x86_t2rpntlvwz0t1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0t1">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+ [ImmArg<ArgIndex<0>>]>;
+ def int_x86_t2rpntlvwz1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+ [ImmArg<ArgIndex<0>>]>;
+ def int_x86_t2rpntlvwz1t1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1t1">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+ [ImmArg<ArgIndex<0>>]>;
+ def int_x86_ttransposed : ClangBuiltin<"__builtin_ia32_ttransposed">,
+ Intrinsic<[], [llvm_i8_ty, llvm_i8_ty],
+ [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
+
// AMX - internal intrinsics
def int_x86_ldtilecfg_internal :
ClangBuiltin<"__builtin_ia32_tile_loadconfig_internal">,
@@ -5994,6 +6011,26 @@ let TargetPrefix = "x86" in {
[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
llvm_x86amx_ty, llvm_x86amx_ty,
llvm_x86amx_ty], []>;
+ def int_x86_t2rpntlvwz0_internal :
+ Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+ []>;
+ def int_x86_t2rpntlvwz0t1_internal :
+ Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+ []>;
+ def int_x86_t2rpntlvwz1_internal :
+ Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+ []>;
+ def int_x86_t2rpntlvwz1t1_internal :
+ Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+ []>;
+ def int_x86_ttransposed_internal :
+ ClangBuiltin<"__builtin_ia32_ttransposed_internal">,
+ Intrinsic<[llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty], []>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
index 5ec8a718d5a3eb..1e07fbe64f7d39 100644
--- a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
+++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
@@ -511,6 +511,7 @@ enum OperandEncoding { ENCODINGS ENCODING_max };
ENUM_ENTRY(TYPE_VK, "mask register") \
ENUM_ENTRY(TYPE_VK_PAIR, "mask register pair") \
ENUM_ENTRY(TYPE_TMM, "tile") \
+ ENUM_ENTRY(TYPE_TMM_PAIR, "tile pair") \
ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \
ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \
ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index e5bf196559ba63..53de78e9789fbe 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -262,6 +262,7 @@ X86_FEATURE_COMPAT(AVX10_1_512, "avx10.1-512", 37)
X86_FEATURE_COMPAT(AVX10_2, "avx10.2-256", 0)
X86_FEATURE_COMPAT(AVX10_2_512, "avx10.2-512", 0)
X86_FEATURE (ZU, "zu")
+X86_FEATURE (AMX_TRANSPOSE, "amx-transpose")
// These features aren't really CPU features, but the frontend can set them.
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk")
X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches")
diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h
index 03c333b90108e2..07a00af881afe0 100644
--- a/llvm/lib/Target/X86/AsmParser/X86Operand.h
+++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h
@@ -623,6 +623,37 @@ struct X86Operand final : public MCParsedAsmOperand {
Inst.addOperand(MCOperand::createReg(Reg));
}
+ bool isTILEPair() const {
+ return Kind == Register &&
+ X86MCRegisterClasses[X86::TILERegClassID].contains(getReg());
+ }
+
+ void addTILEPairOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ unsigned Reg = getReg();
+ switch (Reg) {
+ default:
+ llvm_unreachable("Invalid tile register!");
+ case X86::TMM0:
+ case X86::TMM1:
+ Reg = X86::TMM0_TMM1;
+ break;
+ case X86::TMM2:
+ case X86::TMM3:
+ Reg = X86::TMM2_TMM3;
+ break;
+ case X86::TMM4:
+ case X86::TMM5:
+ Reg = X86::TMM4_TMM5;
+ break;
+ case X86::TMM6:
+ case X86::TMM7:
+ Reg = X86::TMM6_TMM7;
+ break;
+ }
+ Inst.addOperand(MCOperand::createReg(Reg));
+ }
+
void addMemOperands(MCInst &Inst, unsigned N) const {
assert((N == 5) && "Invalid number of operands!");
if (getMemBaseReg())
diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
index ee1c8144f681e5..f198234f1ca302 100644
--- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -806,6 +806,10 @@ static int readModRM(struct InternalInstruction *insn) {
if (index > 7) \
*valid = 0; \
return prefix##_TMM0 + index; \
+ case TYPE_TMM_PAIR: \
+ if (index > 7) \
+ *valid = 0; \
+ return prefix##_TMM0_TMM1 + (index / 2); \
case TYPE_VK: \
index &= 0xf; \
if (index > 7) \
@@ -2315,6 +2319,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
case TYPE_YMM:
case TYPE_ZMM:
case TYPE_TMM:
+ case TYPE_TMM_PAIR:
case TYPE_VK_PAIR:
case TYPE_VK:
case TYPE_DEBUGREG:
diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index b0aa70be12d837..dc9af2caa77b19 100644
--- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -535,6 +535,12 @@ namespace X86Disassembler {
ENTRY(TMM6) \
ENTRY(TMM7)
+#define REGS_TMM_PAIRS \
+ ENTRY(TMM0_TMM1) \
+ ENTRY(TMM2_TMM3) \
+ ENTRY(TMM4_TMM5) \
+ ENTRY(TMM6_TMM7)
+
#define ALL_EA_BASES \
EA_BASES_16BIT \
EA_BASES_32BIT \
@@ -559,6 +565,7 @@ namespace X86Disassembler {
REGS_DEBUG \
REGS_CONTROL \
REGS_TMM \
+ REGS_TMM_PAIRS \
ENTRY(RIP)
/// All possible values of the base field for effective-address
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index e7ba13215feb59..51b82321d679bf 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -463,3 +463,22 @@ void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo,
}
llvm_unreachable("Unknown mask pair register name");
}
+
+void X86InstPrinterCommon::printTILEPair(const MCInst *MI, unsigned OpNo,
+ raw_ostream &OS) {
+ switch (MI->getOperand(OpNo).getReg()) {
+ case X86::TMM0_TMM1:
+ printRegName(OS, X86::TMM0);
+ return;
+ case X86::TMM2_TMM3:
+ printRegName(OS, X86::TMM2);
+ return;
+ case X86::TMM4_TMM5:
+ printRegName(OS, X86::TMM4);
+ return;
+ case X86::TMM6_TMM7:
+ printRegName(OS, X86::TMM6);
+ return;
+ }
+ llvm_unreachable("Unknown mask pair register name");
+}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
index 221102e17c6538..2a7b750bd67521 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
@@ -38,6 +38,7 @@ class X86InstPrinterCommon : public MCInstPrinter {
const MCSubtargetInfo &STI);
void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+ void printTILEPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
};
} // end namespace llvm
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index d57450d91ea2dd..9a2ff76090f7d0 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -270,6 +270,9 @@ def FeatureAMXFP16 : SubtargetFeature<"amx-fp16", "HasAMXFP16", "true",
def FeatureAMXCOMPLEX : SubtargetFeature<"amx-complex", "HasAMXCOMPLEX", "true",
"Support AMX-COMPLEX instructions",
[FeatureAMXTILE]>;
+def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true",
+ "Support AMX amx-transpose instructions",
+ [FeatureAMXTILE]>;
def FeatureCMPCCXADD : SubtargetFeature<"cmpccxadd", "HasCMPCCXADD", "true",
"Support CMPCCXADD instructions">;
def FeatureRAOINT : SubtargetFeature<"raoint", "HasRAOINT", "true",
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index f4c67f115c9f3d..5584c08a983037 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -568,6 +568,131 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MI.setDesc(TII->get(Opc));
return true;
}
+ // TILEPAIRLOAD is just for TILEPair spill, we don't have corresponding
+ // AMX instruction to support it. So, split it to 2 load instructions:
+ // "TILEPAIRLOAD TMM0:TMM1, Base, Scale, Index, Offset, Segment" -->
+ // "TILELOAD TMM0, Base, Scale, Index, Offset, Segment" +
+ // "TILELOAD TMM1, Base, Scale, Index, Offset + TMM_SIZE, Segment"
+ case X86::PTILEPAIRLOAD: {
+ int64_t Disp = MBBI->getOperand(1 + X86::AddrDisp).getImm();
+ Register TReg = MBBI->getOperand(0).getReg();
+ bool DstIsDead = MBBI->getOperand(0).isDead();
+ Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0);
+ Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1);
+ unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8;
+
+ MachineInstrBuilder MIBLo =
+ BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD))
+ .addReg(TReg0, RegState::Define | getDeadRegState(DstIsDead));
+ MachineInstrBuilder MIBHi =
+ BuildMI(MBB, MBBI, DL, TII->get(X86::TILELOADD))
+ .addReg(TReg1, RegState::Define | getDeadRegState(DstIsDead));
+
+ for (int i = 0; i < X86::AddrNumOperands; ++i) {
+ MIBLo.add(MBBI->getOperand(1 + i));
+ if (i == X86::AddrDisp)
+ MIBHi.addImm(Disp + TmmSize);
+ else
+ MIBHi.add(MBBI->getOperand(1 + i));
+ }
+
+ // Make sure the first stride reg used in first tileload is alive.
+ MachineOperand &Stride =
+ MIBLo.getInstr()->getOperand(1 + X86::AddrIndexReg);
+ Stride.setIsKill(false);
+
+ // Split the memory operand, adjusting the offset and size for the halves.
+ MachineMemOperand *OldMMO = MBBI->memoperands().front();
+ MachineFunction *MF = MBB.getParent();
+ MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize);
+ MachineMemOperand *MMOHi =
+ MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize);
+
+ MIBLo.setMemRefs(MMOLo);
+ MIBHi.setMemRefs(MMOHi);
+
+ // Delete the pseudo.
+ MBB.erase(MBBI);
+ return true;
+ }
+ // Smilar with TILEPAIRLOAD, TILEPAIRSTORE is just for TILEPair spill, no
+ // corresponding AMX instruction to support it. So, split it too:
+ // "TILEPAIRSTORE Base, Scale, Index, Offset, Segment, TMM0:TMM1" -->
+ // "TILESTORE Base, Scale, Index, Offset, Segment, TMM0" +
+ // "TILESTORE Base, Scale, Index, Offset + TMM_SIZE, Segment, TMM1"
+ case X86::PTILEPAIRSTORE: {
+ int64_t Disp = MBBI->getOperand(X86::AddrDisp).getImm();
+ Register TReg = MBBI->getOperand(X86::AddrNumOperands).getReg();
+ bool SrcIsKill = MBBI->getOperand(X86::AddrNumOperands).isKill();
+ Register TReg0 = TRI->getSubReg(TReg, X86::sub_t0);
+ Register TReg1 = TRI->getSubReg(TReg, X86::sub_t1);
+ unsigned TmmSize = TRI->getRegSizeInBits(X86::TILERegClass) / 8;
+
+ MachineInstrBuilder MIBLo =
+ BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED));
+ MachineInstrBuilder MIBHi =
+ BuildMI(MBB, MBBI, DL, TII->get(X86::TILESTORED));
+
+ for (int i = 0; i < X86::AddrNumOperands; ++i) {
+ MIBLo.add(MBBI->getOperand(i));
+ if (i == X86::AddrDisp)
+ MIBHi.addImm(Disp + TmmSize);
+ else
+ MIBHi.add(MBBI->getOperand(i));
+ }
+ MIBLo.addReg(TReg0, getKillRegState(SrcIsKill));
+ MIBHi.addReg(TReg1, getKillRegState(SrcIsKill));
+
+ // Make sure the first stride reg used in first tilestore is alive.
+ MachineOperand &Stride = MIBLo.getInstr()->getOperand(X86::AddrIndexReg);
+ Stride.setIsKill(false);
+
+ // Split the memory operand, adjusting the offset and size for the halves.
+ MachineMemOperand *OldMMO = MBBI->memoperands().front();
+ MachineFunction *MF = MBB.getParent();
+ MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, TmmSize);
+ MachineMemOperand *MMOHi =
+ MF->getMachineMemOperand(OldMMO, TmmSize, TmmSize);
+
+ MIBLo.setMemRefs(MMOLo);
+ MIBHi.setMemRefs(MMOHi);
+
+ // Delete the pseudo.
+ MBB.erase(MBBI);
+ return true;
+ }
+ case X86::PT2RPNTLVWZ0V:
+ case X86::PT2RPNTLVWZ0T1V:
+ case X86::PT2RPNTLVWZ1V:
+ case X86::PT2RPNTLVWZ1T1V: {
+ for (unsigned i = 3; i > 0; --i)
+ MI.removeOperand(i);
+ unsigned Opc;
+ switch (Opcode) {
+ case X86::PT2RPNTLVWZ0V:
+ Opc = X86::T2RPNTLVWZ0;
+ break;
+ case X86::PT2RPNTLVWZ0T1V:
+ Opc = X86::T2RPNTLVWZ0T1;
+ break;
+ case X86::PT2RPNTLVWZ1V:
+ Opc = X86::T2RPNTLVWZ1;
+ break;
+ case X86::PT2RPNTLVWZ1T1V:
+ Opc = X86::T2RPNTLVWZ1T1;
+ break;
+ default:
+ llvm_unreachable("Impossible Opcode!");
+ }
+ MI.setDesc(TII->get(Opc));
+ return true;
+ }
+ case X86::PTTRANSPOSEDV: {
+ for (int i = 2; i > 0; --i)
+ MI.removeOperand(i);
+ MI.setDesc(TII->get(X86::TTRANSPOSED));
+ return true;
+ }
case X86::PTCMMIMFP16PSV:
case X86::PTCMMRLFP16PSV:
case X86::PTDPBSSDV:
diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
index d50a4d3b23ae22..fd0987db5e4f55 100644
--- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
@@ -268,24 +268,36 @@ void X86FastPreTileConfig::reload(MachineBasicBlock::iterator UseMI,
<< printReg(TileReg, TRI) << '\n');
}
+unsigned getTileDefNum(MachineRegisterInfo *MRI, Register Reg) {
+ if (Reg.isVirtual()) {
+ unsigned RegClassID = MRI->getRegClass(Reg)->getID();
+ if (RegClassID == X86::TILERegClassID)
+ return 1;
+ if (RegClassID == X86::TILEPAIRRegClassID)
+ return 2;
+ } else {
+ if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
+ return 1;
+ if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
+ return 2;
+ }
+ return 0;
+}
+
+static bool isTileRegister(MachineRegisterInfo *MRI, Register VirtReg) {
+ return getTileDefNum(MRI, VirtReg) > 0;
+}
+
static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
// The instruction must have 3 operands: tile def, row, col.
if (MI.isDebugInstr() || MI.getNumOperands() < 3 || !MI.isPseudo())
return false;
MachineOperand &MO = MI.getOperand(0);
- if (MO.isReg()) {
- Register Reg = MO.getReg();
- // FIXME it may be used after Greedy RA and the physical
- // register is not rewritten yet.
- if (Reg.isVirtual() &&
- MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)
- return true;
- if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
- return true;
- }
+ if (!MO.isReg())
+ return false;
- return false;
+ return getTileDefNum(MRI, MO.getReg()) > 0;
}
static ShapeT getShape(MachineRegisterInfo *MRI, Register TileReg) {
@@ -424,8 +436,7 @@ void X86FastPreTileConfig::convertPHI(MachineBasicBlock *MBB,
static bool isTileRegDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
MachineOperand &MO = MI.getOperand(0);
- if (MO.isReg() && MO.getReg().isVirtual() &&
- MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID)
+ if (MO.isReg() && MO.getReg().isVirtual() && isTileRegister(MRI, MO.getReg()))
return true;
return false;
}
@@ -524,8 +535,7 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
if (!MO.isReg())
continue;
Register Reg = MO.getReg();
- if (Reg.isVirtual() &&
- MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)
+ if (Reg.isVirtual() && isTileRegister(MRI, Reg))
return true;
}
return false;
@@ -617,6 +627,19 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
else if (dominates(MBB, LastShapeMI, ColMI))
LastShapeMI = ColMI;
}
+ unsigned TileDefNum = getTileDefNum(MRI, MI.getOperand(0).getReg());
+ if (TileDefNum > 1) {
+ for (unsigned I = 1; I < TileDefNum; I++) {
+ MachineOperand *ColxMO = &MI.getOperand(2 + I);
+ MachineInstr *ColxMI = MRI->getVRegDef(ColxMO->getReg());
+ if (ColxMI->getParent() == &MBB) {
+ if (!LastShapeMI)
+ LastShapeMI = ColxMI;
+ else if (dominates(MBB, LastShapeMI, ColxMI))
+ LastShapeMI = ColxMI;
+ }
+ }
+ }
// If there is user live out of the tilecfg, spill it and reload in
// before the user.
Register TileReg = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp
index 70bc11228be6aa..72264dd6a5c38f 100644
--- a/llvm/lib/Target/X86/X86FastTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp
@@ -80,28 +80,41 @@ INITIALIZE_PASS_BEGIN(X86FastTileConfig, DEBUG_TYPE,
INITIALIZE_PASS_END(X86FastTileConfig, DEBUG_TYPE,
"Fast Tile Register Configure", false, false)
-static bool isTileDef(MachineRegisterInfo *MRI, MachineInstr &MI) {
+static unsigned getNumDefTiles(MachineRegisterInfo *MRI, MachineInstr &MI) {
// There is no phi instruction after register allocation.
assert(MI.isPHI() == false);
// The instruction must have 3 operands: tile def, row, col.
// It should be AMX pseudo instruction that have shape operand.
if (MI.isDebugInstr() || MI.isCopy() || MI.getNumOperands() < 3 ||
!MI.isPseudo())
- return false;
+ return 0;
MachineOperand &MO = MI.getOperand(0);
if (MO.isReg()) {
Register Reg = MO.getReg();
- // FIXME it may be used after Greedy RA and the physical
+ // FIXME: It may be used after Greedy RA and the physical
// register is not rewritten yet.
- if (Reg.isVirtual() &&
- MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)
- return true;
+ if (Reg.isVirtual()) {
+ if (MRI->getRegClass(Reg)->getID() == X86::TILERegClassID)
+ return 1;
+ if (MRI->getRegClass(Reg)->getID() == X86::TILEPAIRRegClassID)
+ return 2;
+ }
if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
- return true;
+ return 1;
+ if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
+ return 2;
}
- return false;
+ return 0;
+}
+
+static unsigned getTMMIndex(Register Reg) {
+ if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
+ return Reg - X86::TMM0;
+ if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
+ return (Reg - X86::TMM0_TMM1) * 2;
+ llvm_unreachable("Invalid Tmm Reg!");
}
// PreTileConfig should configure the tile registers based on basic
@@ -110,14 +123,17 @@ bool X86FastTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
bool Change = false;
SmallVector<std::pair<unsigned, ShapeT>, 6> ShapeInfos;
for (MachineInstr &MI : reverse(MBB)) {
- if (!isTileDef(MRI, MI) && MI.getOpcode() != X86::PLDTILECFGV)
+ unsigned DefNum = getNumDefTiles(MRI, MI);
+ if (DefNum == 0 && MI.getOpcode() != X86::PLDTILECFGV)
continue;
// AMX instructions that define tile register.
if (MI.getOpcode() != X86::PLDTILECFGV) {
MachineOperand &Row = MI.getOperand(1);
- MachineOperand &Col = MI.getOperand(2);
- unsigned TMMIdx = MI.getOperand(0).getReg() - X86::TMM0;
- ShapeInfos.push_back({TMMIdx, ShapeT(&Row, &Col)});
+ unsigned TMMIdx = getTMMIndex(MI.getOperand(0).getReg());
+ for (unsigned I = 0; I < DefNum; I++) {
+ MachineOperand &Col = MI.getOperand(2 + I);
+ ShapeInfos.push_back({TMMIdx + I, ShapeT(&Row, &Col)});
+ }
} else { // PLDTILECFGV
// Rewrite the shape information to memory. Stack slot should have
// been initialized to zero in pre config.
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 70e4c199190d63..aea86c280e2f99 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -323,6 +323,35 @@ namespace {
Segment = CurDAG->getRegister(0, MVT::i16);
}
+ // Utility function to determine whether it is AMX SDNode right after
+ // lowering but before ISEL.
+ bool isAMXSDNode(SDNode *N) const {
+ // Check if N is AMX SDNode:
+ // 1. check specific opcode since these carry MVT::Untyped instead of
+ // x86amx_type;
+ // 2. check result type;
+ // 3. check operand type;
+ switch (N->getOpcode()) {
+ default:
+ break;
+ case X86::PT2RPNTLVWZ0V:
+ case X86::PT2RPNTLVWZ0T1V:
+ case X86::PT2RPNTLVWZ1V:
+ case X86::PT2RPNTLVWZ1T1V:
+ return true;
+ }
+ for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
+ if (N->getValueType(Idx) == MVT::x86amx)
+ return true;
+ }
+ for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
+ SDValue Op = N->getOperand(Idx);
+ if (Op.getValueType() == MVT::x86amx)
+ return true;
+ }
+ return false;
+ }
+
// Utility function to determine whether we should avoid selecting
// immediate forms of instructions for better code size or not.
// At a high level, we'd like to avoid such instructions when
@@ -5278,6 +5307,47 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, CNode);
return;
}
+ case Intrinsic::x86_t2rpntlvwz0:
+ case Intrinsic::x86_t2rpntlvwz0t1:
+ case Intrinsic::x86_t2rpntlvwz1:
+ case Intrinsic::x86_t2rpntlvwz1t1: {
+ if (!Subtarget->hasAMXTRANSPOSE())
+ break;
+ auto *MFI =
+ CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
+ unsigned Opc;
+ switch (IntNo) {
+ default:
+ llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_t2rpntlvwz0:
+ Opc = X86::PT2RPNTLVWZ0;
+ break;
+ case Intrinsic::x86_t2rpntlvwz0t1:
+ Opc = X86::PT2RPNTLVWZ0T1;
+ break;
+ case Intrinsic::x86_t2rpntlvwz1:
+ Opc = X86::PT2RPNTLVWZ1;
+ break;
+ case Intrinsic::x86_t2rpntlvwz1t1:
+ Opc = X86::PT2RPNTLVWZ1T1;
+ break;
+ }
+ // FIXME: Match displacement and scale.
+ unsigned TIndex = Node->getConstantOperandVal(2);
+ SDValue TReg = getI8Imm(TIndex, dl);
+ SDValue Base = Node->getOperand(3);
+ SDValue Scale = getI8Imm(1, dl);
+ SDValue Index = Node->getOperand(4);
+ SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+ SDValue Chain = Node->getOperand(0);
+ MachineSDNode *CNode;
+ SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
+ CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ ReplaceNode(Node, CNode);
+ return;
+ }
}
break;
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7f4dc12a20837f..676ae6b8761035 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -27281,6 +27281,53 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
}
+ case Intrinsic::x86_t2rpntlvwz0_internal:
+ case Intrinsic::x86_t2rpntlvwz0t1_internal:
+ case Intrinsic::x86_t2rpntlvwz1_internal:
+ case Intrinsic::x86_t2rpntlvwz1t1_internal: {
+ if (!Subtarget.hasAMXTILE())
+ break;
+ auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ X86MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
+ unsigned IntNo = Op.getConstantOperandVal(1);
+ unsigned Opc = 0;
+ switch (IntNo) {
+ default:
+ llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_t2rpntlvwz0_internal:
+ Opc = X86::PT2RPNTLVWZ0V;
+ break;
+ case Intrinsic::x86_t2rpntlvwz0t1_internal:
+ Opc = X86::PT2RPNTLVWZ0T1V;
+ break;
+ case Intrinsic::x86_t2rpntlvwz1_internal:
+ Opc = X86::PT2RPNTLVWZ1V;
+ break;
+ case Intrinsic::x86_t2rpntlvwz1t1_internal:
+ Opc = X86::PT2RPNTLVWZ1T1V;
+ break;
+ }
+
+ SDLoc DL(Op);
+ SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
+
+ SDValue Ops[] = {Op.getOperand(2), // Row
+ Op.getOperand(3), // Col0
+ Op.getOperand(4), // Col1
+ Op.getOperand(5), // Base
+ DAG.getTargetConstant(1, DL, MVT::i8), // Scale
+ Op.getOperand(6), // Index
+ DAG.getTargetConstant(0, DL, MVT::i32), // Disp
+ DAG.getRegister(0, MVT::i16), // Segment
+ Op.getOperand(0)}; // Chain
+
+ MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
+ SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
+ SDValue(Res, 0));
+ SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
+ SDValue(Res, 0));
+ return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
+ }
case Intrinsic::x86_atomic_bts_rm:
case Intrinsic::x86_atomic_btc_rm:
case Intrinsic::x86_atomic_btr_rm: {
@@ -37029,6 +37076,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
assert (Imm < 8 && "Illegal tmm index");
return X86::TMM0 + Imm;
};
+ auto TMMImmToTMMPair = [](unsigned Imm) {
+ assert(Imm < 8 && "Illegal tmm pair index.");
+ return X86::TMM0_TMM1 + Imm / 2;
+ };
switch (MI.getOpcode()) {
default: llvm_unreachable("Unexpected instr type to insert");
case X86::TLS_addr32:
@@ -37503,6 +37554,49 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
+ case X86::PT2RPNTLVWZ0:
+ case X86::PT2RPNTLVWZ0T1:
+ case X86::PT2RPNTLVWZ1:
+ case X86::PT2RPNTLVWZ1T1: {
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected instruction!");
+ case X86::PT2RPNTLVWZ0:
+ Opc = X86::T2RPNTLVWZ0;
+ break;
+ case X86::PT2RPNTLVWZ0T1:
+ Opc = X86::T2RPNTLVWZ0T1;
+ break;
+ case X86::PT2RPNTLVWZ1:
+ Opc = X86::T2RPNTLVWZ1;
+ break;
+ case X86::PT2RPNTLVWZ1T1:
+ Opc = X86::T2RPNTLVWZ1T1;
+ break;
+ }
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
+ MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
+
+ MIB.add(MI.getOperand(1)); // base
+ MIB.add(MI.getOperand(2)); // scale
+ MIB.add(MI.getOperand(3)); // index
+ MIB.add(MI.getOperand(4)); // displacement
+ MIB.add(MI.getOperand(5)); // segment
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+ case X86::PTTRANSPOSED: {
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::TTRANSPOSED));
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
}
}
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index 99deacc811a170..1c437e8f03ffb0 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -267,3 +267,66 @@ let Predicates = [HasAMXCOMPLEX, In64BitMode] in {
}
} // SchedRW = [WriteSystem]
}
+
+let Predicates = [HasAMXTILE, In64BitMode], isPseudo = true, SchedRW = [WriteSystem] in {
+ let mayStore = 1 in
+ def PTILEPAIRSTORE : PseudoI<(outs), (ins opaquemem:$src1, TILEPair:$src2), []>;
+ let mayLoad = 1 in
+ def PTILEPAIRLOAD : PseudoI<(outs TILEPair:$dst), (ins opaquemem:$src), []>;
+}
+
+let Predicates = [HasAMXTRANSPOSE, In64BitMode] in {
+ let SchedRW = [WriteSystem] in {
+ def T2RPNTLVWZ0 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst),
+ (ins sibmem:$src), "t2rpntlvwz0\t{$src, $dst|$dst, $src}",
+ []>, VEX, WIG, T8,PS;
+
+ def T2RPNTLVWZ0T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst),
+ (ins sibmem:$src), "t2rpntlvwz0t1\t{$src, $dst|$dst, $src}",
+ []>, VEX, T8,PS;
+
+ def T2RPNTLVWZ1 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst),
+ (ins sibmem:$src), "t2rpntlvwz1\t{$src, $dst|$dst, $src}",
+ []>, VEX, T8,PD;
+
+ def T2RPNTLVWZ1T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst),
+ (ins sibmem:$src), "t2rpntlvwz1t1\t{$src, $dst|$dst, $src}",
+ []>, VEX, T8,PD;
+
+ def TTRANSPOSED : I<0x5f, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src),
+ "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8,XS;
+ let isPseudo = true in {
+ def PT2RPNTLVWZ0V : PseudoI<(outs TILEPair:$dst),
+ (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
+ []>;
+ def PT2RPNTLVWZ0T1V : PseudoI<(outs TILEPair:$dst),
+ (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
+ []>;
+ def PT2RPNTLVWZ1V : PseudoI<(outs TILEPair:$dst),
+ (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
+ []>;
+ def PT2RPNTLVWZ1T1V : PseudoI<(outs TILEPair:$dst),
+ (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
+ []>;
+ }
+
+ def PTTRANSPOSEDV : PseudoI<(outs TILE:$dst),
+ (ins GR16:$src1, GR16:$src2, TILE:$src),
+ [(set TILE: $dst,
+ (int_x86_ttransposed_internal GR16:$src1, GR16:$src2,
+ TILE:$src))]>;
+
+ let usesCustomInserter = 1 in {
+ def PT2RPNTLVWZ0 : PseudoI<(outs), (ins u8imm:$dst,
+ sibmem:$src1), []>;
+ def PT2RPNTLVWZ0T1 : PseudoI<(outs), (ins u8imm:$dst,
+ sibmem:$src1), []>;
+ def PT2RPNTLVWZ1 : PseudoI<(outs), (ins u8imm:$dst,
+ sibmem:$src1), []>;
+ def PT2RPNTLVWZ1T1 : PseudoI<(outs), (ins u8imm:$dst,
+ sibmem:$src1), []>;
+ def PTTRANSPOSED : PseudoI<(outs), (ins u8imm:$dst, u8imm:$src),
+ [(int_x86_ttransposed timm:$dst, timm:$src)]>;
+ }
+ }
+} // HasAMXTILE, HasAMXTRANSPOSE
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 38ea1f35be2b9a..9b002ebd3a93bc 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4538,6 +4538,11 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD)
: GET_EGPR_IF_ENABLED(X86::TILESTORED);
#undef GET_EGPR_IF_ENABLED
+ case 2048:
+ assert(X86::TILEPAIRRegClass.hasSubClassEq(RC) &&
+ "Unknown 2048-byte regclass");
+ assert(STI.hasAMXTILE() && "Using 2048-bit register requires AMX-TILE");
+ return Load ? X86::PTILEPAIRLOAD : X86::PTILEPAIRSTORE;
}
}
@@ -4732,6 +4737,7 @@ static bool isAMXOpcode(unsigned Opc) {
case X86::TILESTORED:
case X86::TILELOADD_EVEX:
case X86::TILESTORED_EVEX:
+ case X86::PTILEPAIRLOAD:
return true;
}
}
@@ -4744,7 +4750,8 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB,
default:
llvm_unreachable("Unexpected special opcode!");
case X86::TILESTORED:
- case X86::TILESTORED_EVEX: {
+ case X86::TILESTORED_EVEX:
+ case X86::PTILEPAIRSTORE: {
// tilestored %tmm, (%sp, %idx)
MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
@@ -4758,7 +4765,8 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB,
break;
}
case X86::TILELOADD:
- case X86::TILELOADD_EVEX: {
+ case X86::TILELOADD_EVEX:
+ case X86::PTILEPAIRLOAD: {
// tileloadd (%sp, %idx), %tmm
MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
diff --git a/llvm/lib/Target/X86/X86InstrOperands.td b/llvm/lib/Target/X86/X86InstrOperands.td
index f8f5cd83166e33..2102cb4b6b5b73 100644
--- a/llvm/lib/Target/X86/X86InstrOperands.td
+++ b/llvm/lib/Target/X86/X86InstrOperands.td
@@ -501,3 +501,10 @@ def VK8Pair : RegisterOperand<VK8PAIR, "printVKPair"> {
def VK16Pair : RegisterOperand<VK16PAIR, "printVKPair"> {
let ParserMatchClass = VK16PairAsmOperand;
}
+
+let RenderMethod = "addTILEPairOperands" in
+ def TILEPairAsmOperand : AsmOperandClass { let Name = "TILEPair"; }
+
+def TILEPair : RegisterOperand<TILEPAIR, "printTILEPair"> {
+ let ParserMatchClass = TILEPairAsmOperand;
+}
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index a815ddc9714f0c..14aff7f82710e5 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -182,6 +182,7 @@ def HasAMXTILE : Predicate<"Subtarget->hasAMXTILE()">;
def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">;
def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">;
def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">;
+def HasAMXTRANSPOSE : Predicate<"Subtarget->hasAMXTRANSPOSE()">;
def HasUINTR : Predicate<"Subtarget->hasUINTR()">;
def HasUSERMSR : Predicate<"Subtarget->hasUSERMSR()">;
def HasCRC32 : Predicate<"Subtarget->hasCRC32()">;
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 919e1eb3e38e9a..b9d811e91ab5a3 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -74,6 +74,22 @@ static bool isAMXCast(Instruction *II) {
match(II, m_Intrinsic<Intrinsic::x86_cast_tile_to_vector>(m_Value()));
}
+// Some instructions may return more than one tiles.
+// e.g: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal
+static unsigned getNumDefTiles(IntrinsicInst *II) {
+ Type *Ty = II->getType();
+ if (Ty->isX86_AMXTy())
+ return 1;
+
+ unsigned Num = 0;
+ for (unsigned i = 0; i < Ty->getNumContainedTypes(); i++) {
+ Type *STy = Ty->getContainedType(i);
+ if (STy->isX86_AMXTy())
+ Num++;
+ }
+ return Num;
+}
+
static bool isAMXIntrinsic(Value *I) {
auto *II = dyn_cast<IntrinsicInst>(I);
if (!II)
@@ -82,7 +98,7 @@ static bool isAMXIntrinsic(Value *I) {
return false;
// Check if return type or parameter is x86_amx. If it is x86_amx
// the intrinsic must be x86 amx intrinsics.
- if (II->getType()->isX86_AMXTy())
+ if (getNumDefTiles(II) > 0)
return true;
for (Value *V : II->args()) {
if (V->getType()->isX86_AMXTy())
@@ -121,12 +137,96 @@ static Instruction *getFirstNonAllocaInTheEntryBlock(Function &F) {
llvm_unreachable("No terminator in the entry block!");
}
-static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
+class ShapeCalculator {
+private:
+ TargetMachine *TM = nullptr;
+
+ // In AMX intrinsics we let Shape = {Row, Col}, but the
+ // RealCol = Col / ElementSize. We may use the RealCol
+ // as a new Row for other new created AMX intrinsics.
+ std::map<Value *, Value *> Col2Row, Row2Col;
+
+public:
+ ShapeCalculator(TargetMachine *TargetM) : TM(TargetM) {}
+ std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo);
+ std::pair<Value *, Value *> getShape(PHINode *Phi);
+ Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity);
+ Value *getColFromRow(Instruction *II, Value *V, unsigned Granularity);
+};
+
+Value *ShapeCalculator::getRowFromCol(Instruction *II, Value *V,
+ unsigned Granularity) {
+ if (Col2Row.count(V))
+ return Col2Row[V];
+ IRBuilder<> Builder(II);
+ Value *RealRow = nullptr;
+ if (isa<ConstantInt>(V))
+ RealRow =
+ Builder.getInt16((cast<ConstantInt>(V)->getSExtValue()) / Granularity);
+ else if (isa<Instruction>(V)) {
+ // When it is not a const value and it is not a function argument, we
+ // create Row after the definition of V instead of
+ // before II. For example, II is %118, we try to getshape for %117:
+ // %117 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x
+ // i32> %115).
+ // %118 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16
+ // %104, i16 %105, i16 %106, x86_amx %110, x86_amx %114, x86_amx
+ // %117).
+ // If we create %row = udiv i16 %106, 4 before %118(aka. II), then its
+ // definition is after its user(new tileload for %117).
+ // So, the best choice is to create %row right after the definition of
+ // %106.
+ Builder.SetInsertPoint(cast<Instruction>(V));
+ RealRow = Builder.CreateUDiv(V, Builder.getInt16(4));
+ cast<Instruction>(RealRow)->moveAfter(cast<Instruction>(V));
+ } else {
+ // When it is not a const value and it is a function argument, we create
+ // Row at the entry bb.
+ IRBuilder<> NewBuilder(
+ getFirstNonAllocaInTheEntryBlock(*II->getFunction()));
+ RealRow = NewBuilder.CreateUDiv(V, NewBuilder.getInt16(Granularity));
+ }
+ Col2Row[V] = RealRow;
+ return RealRow;
+}
+
+Value *ShapeCalculator::getColFromRow(Instruction *II, Value *V,
+ unsigned Granularity) {
+ if (Row2Col.count(V))
+ return Row2Col[V];
+ IRBuilder<> Builder(II);
+ Value *RealCol = nullptr;
+ if (isa<ConstantInt>(V))
+ RealCol =
+ Builder.getInt16((cast<ConstantInt>(V)->getSExtValue()) * Granularity);
+ else if (isa<Instruction>(V)) {
+ Builder.SetInsertPoint(cast<Instruction>(V));
+ RealCol = Builder.CreateNUWMul(V, Builder.getInt16(Granularity));
+ cast<Instruction>(RealCol)->moveAfter(cast<Instruction>(V));
+ } else {
+ // When it is not a const value and it is a function argument, we create
+ // Row at the entry bb.
+ IRBuilder<> NewBuilder(
+ getFirstNonAllocaInTheEntryBlock(*II->getFunction()));
+ RealCol = NewBuilder.CreateNUWMul(V, NewBuilder.getInt16(Granularity));
+ }
+ Row2Col[V] = RealCol;
+ return RealCol;
+}
+
+// TODO: Refine the row and col-in-bytes of tile to row and col of matrix.
+std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II,
+ unsigned OpNo) {
+ (void)TM;
IRBuilder<> Builder(II);
Value *Row = nullptr, *Col = nullptr;
switch (II->getIntrinsicID()) {
default:
llvm_unreachable("Expect amx intrinsics");
+ case Intrinsic::x86_t2rpntlvwz0_internal:
+ case Intrinsic::x86_t2rpntlvwz0t1_internal:
+ case Intrinsic::x86_t2rpntlvwz1_internal:
+ case Intrinsic::x86_t2rpntlvwz1t1_internal:
case Intrinsic::x86_tileloadd64_internal:
case Intrinsic::x86_tileloaddt164_internal:
case Intrinsic::x86_tilestored64_internal: {
@@ -154,43 +254,24 @@ static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
Col = II->getArgOperand(2);
break;
case 5:
- if (isa<ConstantInt>(II->getArgOperand(2)))
- Row = Builder.getInt16(
- (cast<ConstantInt>(II->getOperand(2))->getSExtValue()) / 4);
- else if (isa<Instruction>(II->getArgOperand(2))) {
- // When it is not a const value and it is not a function argument, we
- // create Row after the definition of II->getOperand(2) instead of
- // before II. For example, II is %118, we try to getshape for %117:
- // %117 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x
- // i32> %115).
- // %118 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16
- // %104, i16 %105, i16 %106, x86_amx %110, x86_amx %114, x86_amx
- // %117).
- // If we create %row = udiv i16 %106, 4 before %118(aka. II), then its
- // definition is after its user(new tileload for %117).
- // So, the best choice is to create %row right after the definition of
- // %106.
- Builder.SetInsertPoint(cast<Instruction>(II->getOperand(2)));
- Row = Builder.CreateUDiv(II->getOperand(2), Builder.getInt16(4));
- cast<Instruction>(Row)->moveAfter(cast<Instruction>(II->getOperand(2)));
- } else {
- // When it is not a const value and it is a function argument, we create
- // Row at the entry bb.
- IRBuilder<> NewBuilder(
- getFirstNonAllocaInTheEntryBlock(*II->getFunction()));
- Row = NewBuilder.CreateUDiv(II->getOperand(2), NewBuilder.getInt16(4));
- }
+ Row = getRowFromCol(II, II->getArgOperand(2), 4);
Col = II->getArgOperand(1);
break;
}
break;
}
+ case Intrinsic::x86_ttransposed_internal: {
+ assert((OpNo == 2) && "Illegal Operand Number.");
+ Row = getRowFromCol(II, II->getArgOperand(1), 4);
+ Col = getColFromRow(II, II->getArgOperand(0), 4);
+ break;
+ }
}
return std::make_pair(Row, Col);
}
-static std::pair<Value *, Value *> getShape(PHINode *Phi) {
+std::pair<Value *, Value *> ShapeCalculator::getShape(PHINode *Phi) {
Use &U = *(Phi->use_begin());
unsigned OpNo = U.getOperandNo();
User *V = U.getUser();
@@ -223,14 +304,15 @@ static std::pair<Value *, Value *> getShape(PHINode *Phi) {
namespace {
class X86LowerAMXType {
Function &Func;
+ ShapeCalculator *SC;
// In AMX intrinsics we let Shape = {Row, Col}, but the
// RealCol = Col / ElementSize. We may use the RealCol
// as a new Row for other new created AMX intrinsics.
- std::map<Value *, Value *> Col2Row;
+ std::map<Value *, Value *> Col2Row, Row2Col;
public:
- X86LowerAMXType(Function &F) : Func(F) {}
+ X86LowerAMXType(Function &F, ShapeCalculator *ShapeC) : Func(F), SC(ShapeC) {}
bool visit();
void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast);
void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST);
@@ -247,7 +329,7 @@ void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
Use &U = *(Bitcast->use_begin());
unsigned OpNo = U.getOperandNo();
auto *II = cast<IntrinsicInst>(U.getUser());
- std::tie(Row, Col) = getShape(II, OpNo);
+ std::tie(Row, Col) = SC->getShape(II, OpNo);
IRBuilder<> Builder(Bitcast);
// Use the maximun column as stride.
Value *Stride = Builder.getInt64(64);
@@ -327,7 +409,7 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) {
Builder.CreateStore(Src, AllocaAddr);
// TODO we can pick an constant operand for the shape.
Value *Row = nullptr, *Col = nullptr;
- std::tie(Row, Col) = getShape(II, OpNo);
+ std::tie(Row, Col) = SC->getShape(II, OpNo);
std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
Value *NewInst =
Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, {}, Args);
@@ -467,10 +549,18 @@ static Value *getAllocaPos(BasicBlock *BB) {
static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) {
assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!");
- auto *II = cast<IntrinsicInst>(TileDef);
+ auto *II = dyn_cast<IntrinsicInst>(TileDef);
+ unsigned Idx = 0;
+ // Extract tile from mult tiles' def.
+ if (auto *Extr = dyn_cast<ExtractValueInst>(TileDef)) {
+ assert(Extr->hasIndices() && "Tile extract miss index!");
+ Idx = Extr->getIndices()[0];
+ II = cast<IntrinsicInst>(Extr->getOperand(0));
+ }
+
assert(II && "Not tile intrinsic!");
- Value *Row = II->getOperand(0);
- Value *Col = II->getOperand(1);
+ Value *Row = II->getOperand(Idx);
+ Value *Col = II->getOperand(Idx + 1);
BasicBlock *BB = TileDef->getParent();
BasicBlock::iterator Iter = TileDef->getIterator();
@@ -489,14 +579,20 @@ static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) {
// Get tile shape.
IntrinsicInst *II = nullptr;
+ unsigned Idx = 0;
if (IsPHI) {
Value *PhiOp = cast<PHINode>(V)->getIncomingValue(0);
II = cast<IntrinsicInst>(PhiOp);
+ } else if (auto *Extr = dyn_cast<ExtractValueInst>(V)) {
+ // Extract tile from mult tiles' def.
+ assert(Extr->hasIndices() && "Tile extract miss index!");
+ Idx = Extr->getIndices()[0];
+ II = cast<IntrinsicInst>(Extr->getOperand(0));
} else {
II = cast<IntrinsicInst>(V);
}
- Value *Row = II->getOperand(0);
- Value *Col = II->getOperand(1);
+ Value *Row = II->getOperand(Idx);
+ Value *Col = II->getOperand(Idx + 1);
Instruction *UserI = cast<Instruction>(U.getUser());
IRBuilder<> Builder(UserI);
@@ -707,10 +803,12 @@ namespace {
class X86LowerAMXCast {
Function &Func;
+ ShapeCalculator *SC;
std::unique_ptr<DominatorTree> DT;
public:
- X86LowerAMXCast(Function &F) : Func(F), DT(nullptr) {}
+ X86LowerAMXCast(Function &F, ShapeCalculator *ShapeC)
+ : Func(F), SC(ShapeC), DT(nullptr) {}
bool combineCastStore(IntrinsicInst *Cast, StoreInst *ST);
bool combineLoadCast(IntrinsicInst *Cast, LoadInst *LD);
bool combineLdSt(SmallVectorImpl<Instruction *> &Casts);
@@ -788,7 +886,7 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi(
if (!isa<UndefValue>(IncValue) && !IncConst->isZeroValue())
return false;
Value *Row = nullptr, *Col = nullptr;
- std::tie(Row, Col) = getShape(OldPN);
+ std::tie(Row, Col) = SC->getShape(OldPN);
// TODO: If it is not constant the Row and Col must domoniate tilezero
// that we are going to create.
if (!Row || !Col || !isa<Constant>(Row) || !isa<Constant>(Col))
@@ -919,6 +1017,19 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi(
return true;
}
+static Value *getShapeFromAMXIntrinsic(Value *Inst, unsigned ShapeIdx,
+ bool IsRow) {
+ if (!isAMXIntrinsic(Inst))
+ return nullptr;
+
+ auto *II = cast<IntrinsicInst>(Inst);
+ if (IsRow)
+ return II->getOperand(0);
+
+ assert(ShapeIdx < 2 && "Currently 2 shapes in 1 instruction at most!");
+ return II->getOperand(ShapeIdx + 1);
+}
+
// %43 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %42)
// store <256 x i32> %43, <256 x i32>* %p, align 64
// -->
@@ -926,16 +1037,46 @@ bool X86LowerAMXCast::optimizeAMXCastFromPhi(
// i64 64, x86_amx %42)
bool X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) {
Value *Tile = Cast->getOperand(0);
- // TODO: If it is cast intrinsic or phi node, we can propagate the
- // shape information through def-use chain.
- if (!isAMXIntrinsic(Tile))
+
+ assert(Tile->getType()->isX86_AMXTy() && "Not Tile Operand!");
+
+ // TODO: Specially handle the mult-use case.
+ if (Tile->getNumUses() != 1)
return false;
- auto *II = cast<IntrinsicInst>(Tile);
- // Tile is output from AMX intrinsic. The first operand of the
- // intrinsic is row, the second operand of the intrinsic is column.
- Value *Row = II->getOperand(0);
- Value *Col = II->getOperand(1);
+
+ // We don't fetch shape from tilestore, we only get shape from tiledef,
+ // so we can set the max tile shape to tilestore for special cases.
IRBuilder<> Builder(ST);
+ Value *Row = nullptr;
+ Value *Col = nullptr;
+
+ if (isAMXIntrinsic(Tile)) {
+ auto *II = cast<IntrinsicInst>(Tile);
+ // Tile is output from AMX intrinsic. The first operand of the
+ // intrinsic is row, the second operand of the intrinsic is column.
+ Row = II->getOperand(0);
+ Col = II->getOperand(1);
+ } else {
+ // Now we supported mult-tiles value in structure, so we may get tile
+ // from extracting mult-tiles structure.
+ // For example:
+ // %6 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %1,
+ // i16 %2, i16 %3, i8* %4, i64 %5)
+ // %7 = extractvalue { x86_amx, x86_amx } %6, 0
+ // %8 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %7)
+ // store <256 x i32> %8, <256 x i32>* %0, align 1024
+ //
+ // TODO: Currently we only handle extractvalue case, enhance me for other
+ // cases if possible.
+ auto *II = cast<ExtractValueInst>(Tile);
+ assert(II && "We meet unhandle source in fetching tile value!");
+ unsigned ShapeIdx = II->getIndices()[0];
+ Value *Tiles = II->getOperand(0);
+ Row = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, true);
+ Col = getShapeFromAMXIntrinsic(Tiles, ShapeIdx, false);
+ }
+ assert(Row && Col && "Shape got failed!");
+
// Stride should be equal to col(measured by bytes)
Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty());
Value *I8Ptr = Builder.CreateBitCast(ST->getOperand(1), Builder.getPtrTy());
@@ -959,7 +1100,7 @@ bool X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) {
// shape information through def-use chain.
if (!isAMXIntrinsic(II))
return false;
- std::tie(Row, Col) = getShape(II, OpNo);
+ std::tie(Row, Col) = SC->getShape(II, OpNo);
IRBuilder<> Builder(LD);
// Stride should be equal to col(measured by bytes)
Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty());
@@ -1169,7 +1310,7 @@ bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) {
Builder.CreateStore(Src, AllocaAddr);
// TODO we can pick an constant operand for the shape.
Value *Row = nullptr, *Col = nullptr;
- std::tie(Row, Col) = getShape(II, OpNo);
+ std::tie(Row, Col) = SC->getShape(II, OpNo);
std::array<Value *, 4> Args = {
Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty())};
Value *NewInst =
@@ -1245,13 +1386,14 @@ class X86LowerAMXTypeLegacyPass : public FunctionPass {
TargetLibraryInfo *TLI =
&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- X86LowerAMXCast LAC(F);
+ ShapeCalculator SC(TM);
+ X86LowerAMXCast LAC(F, &SC);
C |= LAC.combineAMXcast(TLI);
// There might be remaining AMXcast after combineAMXcast and they should be
// handled elegantly.
C |= LAC.transformAllAMXCast();
- X86LowerAMXType LAT(F);
+ X86LowerAMXType LAT(F, &SC);
C |= LAT.visit();
// Prepare for fast register allocation at O0.
diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
index 1d1885a3dcd246..d20bfdcdb7f9c1 100644
--- a/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -118,16 +118,27 @@ class X86PreTileConfig : public MachineFunctionPass {
bool isAMXInstruction(MachineInstr &MI) {
if (MI.isPHI() || MI.isDebugInstr() || MI.getNumOperands() < 3)
return false;
- MachineOperand &MO = MI.getOperand(0);
+
+ // PTILESTOREDV is the only exception that doesn't def a AMX register.
+ if (MI.getOpcode() == X86::PTILESTOREDV)
+ return true;
+
// We can simply check if it is AMX instruction by its def.
// But we should exclude old API which uses physical registers.
- if (MO.isReg() && MO.getReg().isVirtual() &&
- MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID) {
- collectShapeInfo(MI);
- return true;
- }
- // PTILESTOREDV is the only exception that doesn't def a AMX register.
- return MI.getOpcode() == X86::PTILESTOREDV;
+ MachineOperand &MO = MI.getOperand(0);
+ if (!MO.isReg() || !MO.getReg().isVirtual())
+ return false;
+
+ unsigned Shapes = 0;
+ if (MRI->getRegClass(MO.getReg())->getID() == X86::TILERegClassID)
+ Shapes = 1;
+ if (MRI->getRegClass(MO.getReg())->getID() == X86::TILEPAIRRegClassID)
+ Shapes = 2;
+ if (!Shapes)
+ return false;
+
+ collectShapeInfo(MI, Shapes);
+ return true;
}
/// Check if it is an edge from loop bottom to loop head.
@@ -142,7 +153,7 @@ class X86PreTileConfig : public MachineFunctionPass {
}
/// Collect the shape def information for later use.
- void collectShapeInfo(MachineInstr &MI);
+ void collectShapeInfo(MachineInstr &MI, unsigned Shapes);
/// Try to hoist shapes definded below AMX instructions.
bool hoistShapesInBB(MachineBasicBlock *MBB, SmallVectorImpl<MIRef> &Shapes) {
@@ -208,7 +219,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
"Tile Register Pre-configure", false, false)
-void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) {
+void X86PreTileConfig::collectShapeInfo(MachineInstr &MI, unsigned Shapes) {
auto RecordShape = [&](MachineInstr *MI, MachineBasicBlock *MBB) {
MIRef MIR(MI, MBB);
auto I = llvm::lower_bound(ShapeBBs[MBB], MIR);
@@ -216,8 +227,10 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) {
ShapeBBs[MBB].insert(I, MIR);
};
- SmallVector<Register, 8> WorkList(
- {MI.getOperand(1).getReg(), MI.getOperand(2).getReg()});
+ // All shapes have same row in multi-tile operand.
+ SmallVector<Register, 8> WorkList;
+ for (unsigned I = 1; I < Shapes + 2; ++I)
+ WorkList.push_back(MI.getOperand(I).getReg());
while (!WorkList.empty()) {
Register R = WorkList.pop_back_val();
MachineInstr *DefMI = MRI->getVRegDef(R);
@@ -225,6 +238,14 @@ void X86PreTileConfig::collectShapeInfo(MachineInstr &MI) {
MachineBasicBlock *DefMBB = DefMI->getParent();
if (DefMI->isMoveImmediate() || !DefVisited.insert(DefMI).second)
continue;
+
+ // This happens when column = 0 in multi-tile operand.
+ if (DefMI->getOpcode() == X86::COPY) {
+ MachineInstr *MI = MRI->getVRegDef(DefMI->getOperand(1).getReg());
+ if (MI && MI->isMoveImmediate())
+ continue;
+ }
+
if (DefMI->isPHI()) {
for (unsigned I = 1; I < DefMI->getNumOperands(); I += 2)
if (isLoopBackEdge(DefMBB, DefMI->getOperand(I + 1).getMBB()))
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 302d50581e1e6b..2daaa95b06be0d 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -642,6 +642,10 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(*AI);
}
+ // Reserve low half pair registers in case they are used by RA aggressively.
+ Reserved.set(X86::TMM0_TMM1);
+ Reserved.set(X86::TMM2_TMM3);
+
assert(checkAllSuperRegsMarked(Reserved,
{X86::SIL, X86::DIL, X86::BPL, X86::SPL,
X86::SIH, X86::DIH, X86::BPH, X86::SPH}));
@@ -662,7 +666,7 @@ unsigned X86RegisterInfo::getNumSupportedRegs(const MachineFunction &MF) const {
// and try to return the minimum number of registers supported by the target.
static_assert((X86::R15WH + 1 == X86::YMM0) && (X86::YMM15 + 1 == X86::K0) &&
(X86::K6_K7 + 1 == X86::TMMCFG) &&
- (X86::TMM7 + 1 == X86::R16) &&
+ (X86::TMM6_TMM7 + 1 == X86::R16) &&
(X86::R31WH + 1 == X86::NUM_TARGET_REGS),
"Register number may be incorrect");
@@ -735,7 +739,8 @@ bool X86RegisterInfo::isFixedRegister(const MachineFunction &MF,
}
bool X86RegisterInfo::isTileRegisterClass(const TargetRegisterClass *RC) const {
- return RC->getID() == X86::TILERegClassID;
+ return RC->getID() == X86::TILERegClassID ||
+ RC->getID() == X86::TILEPAIRRegClassID;
}
void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
@@ -1073,12 +1078,59 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
case X86::PTDPFP16PSV:
case X86::PTCMMIMFP16PSV:
case X86::PTCMMRLFP16PSV:
+ case X86::PTTRANSPOSEDV: {
MachineOperand &MO1 = MI->getOperand(1);
MachineOperand &MO2 = MI->getOperand(2);
ShapeT Shape(&MO1, &MO2, MRI);
VRM->assignVirt2Shape(VirtReg, Shape);
return Shape;
}
+ case X86::PT2RPNTLVWZ0V:
+ case X86::PT2RPNTLVWZ0T1V:
+ case X86::PT2RPNTLVWZ1V:
+ case X86::PT2RPNTLVWZ1T1V: {
+ MachineOperand &MO1 = MI->getOperand(1);
+ MachineOperand &MO2 = MI->getOperand(2);
+ MachineOperand &MO3 = MI->getOperand(3);
+ ShapeT Shape({&MO1, &MO2, &MO1, &MO3}, MRI);
+ VRM->assignVirt2Shape(VirtReg, Shape);
+ return Shape;
+ }
+ }
+}
+
+static bool canHintShape(ShapeT &PhysShape, ShapeT &VirtShape) {
+ unsigned PhysShapeNum = PhysShape.getShapeNum();
+ unsigned VirtShapeNum = VirtShape.getShapeNum();
+
+ if (PhysShapeNum < VirtShapeNum)
+ return false;
+
+ if (PhysShapeNum == VirtShapeNum) {
+ if (PhysShapeNum == 1)
+ return PhysShape == VirtShape;
+
+ for (unsigned I = 0; I < PhysShapeNum; I++) {
+ ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I));
+ ShapeT VShape(VirtShape.getRow(I), VirtShape.getCol(I));
+ if (VShape != PShape)
+ return false;
+ }
+ return true;
+ }
+
+ // Hint subreg of mult-tile reg to single tile reg.
+ if (VirtShapeNum == 1) {
+ for (unsigned I = 0; I < PhysShapeNum; I++) {
+ ShapeT PShape(PhysShape.getRow(I), PhysShape.getCol(I));
+ if (VirtShape == PShape)
+ return true;
+ }
+ }
+
+ // Note: Currently we have no requirement for case of
+ // (VirtShapeNum > 1 and PhysShapeNum > VirtShapeNum)
+ return false;
}
bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
@@ -1099,7 +1151,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
if (!VRM)
return BaseImplRetVal;
- if (ID != X86::TILERegClassID) {
+ if (ID != X86::TILERegClassID && ID != X86::TILEPAIRRegClassID) {
if (DisableRegAllocNDDHints || !ST.hasNDD() ||
!TRI.isGeneralPurposeRegisterClass(&RC))
return BaseImplRetVal;
@@ -1151,7 +1203,7 @@ bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
return;
}
ShapeT PhysShape = getTileShape(VReg, const_cast<VirtRegMap *>(VRM), MRI);
- if (PhysShape == VirtShape)
+ if (canHintShape(PhysShape, VirtShape))
Hints.push_back(PhysReg);
};
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index 166024bf3b53fe..19a0b37d06a2a5 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -30,6 +30,8 @@ let Namespace = "X86" in {
def sub_ymm : SubRegIndex<256>;
def sub_mask_0 : SubRegIndex<-1>;
def sub_mask_1 : SubRegIndex<-1, -1>;
+ def sub_t0 : SubRegIndex<8192>;
+ def sub_t1 : SubRegIndex<8192, 8192>;
}
//===----------------------------------------------------------------------===//
@@ -431,6 +433,10 @@ def TMM5: X86Reg<"tmm5", 5>;
def TMM6: X86Reg<"tmm6", 6>;
def TMM7: X86Reg<"tmm7", 7>;
}
+// TMM register pairs
+def TPAIRS : RegisterTuples<[sub_t0, sub_t1],
+ [(add TMM0, TMM2, TMM4, TMM6),
+ (add TMM1, TMM3, TMM5, TMM7)]>;
// Floating point stack registers. These don't map one-to-one to the FP
// pseudo registers, but we still mark them as aliasing FP registers. That
@@ -835,6 +841,9 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
let CopyCost = -1 in // Don't allow copying of tile registers
def TILE : RegisterClass<"X86", [x86amx], 8192,
(sequence "TMM%u", 0, 7)> {let Size = 8192;}
+// Need check alignment 3rd operand size=1024*2*8
+let isAllocatable = 1 in
+def TILEPAIR : RegisterClass<"X86", [untyped], 512, (add TPAIRS)> {let Size = 16384;}
//===----------------------------------------------------------------------===//
// Register categories.
diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp
index 2250c3912a90d4..95a84c2cda5369 100644
--- a/llvm/lib/Target/X86/X86TileConfig.cpp
+++ b/llvm/lib/Target/X86/X86TileConfig.cpp
@@ -76,6 +76,63 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
INITIALIZE_PASS_END(X86TileConfig, DEBUG_TYPE, "Tile Register Configure", false,
false)
+unsigned getAMXRegNum(MachineRegisterInfo *MRI, Register Reg) {
+ if (Reg.isVirtual()) {
+ unsigned RegClassID = MRI->getRegClass(Reg)->getID();
+ if (RegClassID == X86::TILERegClassID)
+ return 1;
+ if (RegClassID == X86::TILEPAIRRegClassID)
+ return 2;
+ } else {
+ if (Reg >= X86::TMM0 && Reg <= X86::TMM7)
+ return 1;
+ if (Reg >= X86::TMM0_TMM1 && Reg <= X86::TMM6_TMM7)
+ return 2;
+ }
+ return 0;
+}
+
+static void collectVirtRegShapes(MachineRegisterInfo *MRI, VirtRegMap &VRM,
+ Register VirtReg,
+ SmallVector<ShapeT, 8> &Phys2Shapes) {
+ unsigned Num = getAMXRegNum(MRI, VirtReg);
+ MCRegister PhysReg = VRM.getPhys(VirtReg);
+ if (!PhysReg)
+ return;
+
+ if (Num == 1) {
+ unsigned Index = PhysReg - X86::TMM0;
+ if (!Phys2Shapes[Index].isValid()) {
+ ShapeT Shape = VRM.getShape(VirtReg);
+ Phys2Shapes[Index] = Shape;
+ return;
+ }
+ }
+ // Split tile pair shape info to 2 single tile shape info. e.g:
+ // Put TMM0_TMM1's Shape to TMM0's shape + TMM1's Shape in Phys2Shapes.
+ if (Num == 2) {
+ unsigned Index0 = (PhysReg - X86::TMM0_TMM1) * 2;
+ unsigned Index1 = (PhysReg - X86::TMM0_TMM1) * 2 + 1;
+
+ ShapeT Shape = VRM.getShape(VirtReg);
+ assert(Shape.getShapeNum() == 2 && "Unexpected shape number!");
+
+ if (!Phys2Shapes[Index0].isValid()) {
+ ShapeT Shape0(Shape.getRow(0), Shape.getCol(0), MRI);
+ Phys2Shapes[Index0] = Shape0;
+ }
+
+ if (!Phys2Shapes[Index1].isValid()) {
+ ShapeT Shape1(Shape.getRow(1), Shape.getCol(1), MRI);
+ Phys2Shapes[Index1] = Shape1;
+ }
+ }
+}
+
+static bool isAMXRegClass(MachineRegisterInfo *MRI, Register Reg) {
+ return getAMXRegNum(MRI, Reg) > 0;
+}
+
bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
// Early exit in the common case of non-AMX code.
@@ -121,29 +178,24 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
assert(ConstMI && "Cannot find an insertion point");
unsigned AMXRegNum = TRI->getRegClass(X86::TILERegClassID)->getNumRegs();
- SmallVector<Register, 8> Phys2Virt(AMXRegNum, 0);
+ SmallVector<ShapeT, 8> Phys2Shapes(AMXRegNum, ShapeT());
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
Register VirtReg = Register::index2VirtReg(I);
if (MRI.reg_nodbg_empty(VirtReg))
continue;
- if (MRI.getRegClass(VirtReg)->getID() != X86::TILERegClassID)
- continue;
- MCRegister PhysReg = VRM.getPhys(VirtReg);
- if (!PhysReg)
+ if (!isAMXRegClass(&MRI, VirtReg))
continue;
- unsigned Index = PhysReg - X86::TMM0;
- if (!Phys2Virt[Index])
- Phys2Virt[Index] = VirtReg;
+ collectVirtRegShapes(&MRI, VRM, VirtReg, Phys2Shapes);
}
// Fill in the shape of each tile physical register.
for (unsigned I = 0; I < AMXRegNum; ++I) {
- if (!Phys2Virt[I])
+ ShapeT Shape = Phys2Shapes[I];
+ if (!Shape.isValid())
continue;
DebugLoc DL;
bool IsRow = true;
MachineInstr *NewMI = nullptr;
- ShapeT Shape = VRM.getShape(Phys2Virt[I]);
for (auto &R : {Shape.getRow()->getReg(), Shape.getCol()->getReg()}) {
// Here is the data format for the tile config.
// 0 palette
@@ -172,7 +224,15 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
"Cannot initialize with different shapes");
continue;
}
- Imm = DefMI.getOperand(1).getImm();
+ if (DefMI.getOperand(1).isImm()) {
+ Imm = DefMI.getOperand(1).getImm();
+ } else {
+ assert(DefMI.getOpcode() == X86::MOV32r0 &&
+ "The opcode is assumed to be MOV32r0 if the operand is not "
+ "immediate.");
+ Imm = 0;
+ }
+
NewMI = addFrameReference(
BuildMI(MF.front(), ++ConstMI->getIterator(), DL,
TII->get(IsRow ? X86::MOV8mi : X86::MOV16mi)),
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 1f608f47ef79f4..d48b5029dc740b 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1875,6 +1875,10 @@ const StringMap<bool> sys::getHostCPUFeatures() {
MaxLevel >= 0x19 && !getX86CpuIDAndInfo(0x19, &EAX, &EBX, &ECX, &EDX);
Features["widekl"] = HasLeaf7 && HasLeaf19 && ((EBX >> 2) & 1);
+ bool HasLeaf1E =
+ MaxLevel >= 0x1e && !getX86CpuIDAndInfo(0x1e, &EAX, &EBX, &ECX, &EDX);
+ Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave;
+
bool HasLeaf24 =
MaxLevel >= 0x24 && !getX86CpuIDAndInfo(0x24, &EAX, &EBX, &ECX, &EDX);
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index 09d4312918acfe..e022734eb69fff 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -598,6 +598,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_BF16 = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_FP16 = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE;
+constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesHRESET = {};
constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {};
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll
new file mode 100644
index 00000000000000..4f41410010302e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
+; RUN: -mattr=+amx-transpose -verify-machineinstrs | FileCheck %s
+
+ at buf = dso_local global [2048 x i8] zeroinitializer, align 16
+ at buf2 = dso_local global [2048 x i8] zeroinitializer, align 16
+
+define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 {
+; CHECK-LABEL: test_tile_2rpntlvwz0:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
+; CHECK-NEXT: subq $8192, %rsp # imm = 0x2000
+; CHECK-NEXT: .cfi_offset %rbx, -24
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: # kill: def $dx killed $dx killed $edx
+; CHECK-NEXT: movw %si, %cx
+; CHECK-NEXT: movw %di, %ax
+; CHECK-NEXT: # implicit-def: $al
+; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: # implicit-def: $al
+; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: # implicit-def: $al
+; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: # implicit-def: $cl
+; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: # implicit-def: $al
+; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: # implicit-def: $al
+; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: # implicit-def: $al
+; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: # implicit-def: $al
+; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $buf, %esi
+; CHECK-NEXT: movl $32, %edi
+; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdi), %tmm4
+; CHECK-NEXT: movabsq $64, %rbx
+; CHECK-NEXT: tilestored %tmm5, (%rsp,%rbx) # 1024-byte Folded Spill
+; CHECK-NEXT: tileloadd (%rsp,%rbx), %tmm0 # 1024-byte Folded Reload
+; CHECK-NEXT: movabsq $64, %rbx
+; CHECK-NEXT: tilestored %tmm4, 1024(%rsp,%rbx) # 1024-byte Folded Spill
+; CHECK-NEXT: tileloadd 1024(%rsp,%rbx), %tmm1 # 1024-byte Folded Reload
+; CHECK-NEXT: movl $64, %edi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: tilestored %tmm1, (%rsi,%rdi)
+; CHECK-NEXT: movl $64, %edi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi)
+; CHECK-NEXT: tilezero %tmm0
+; CHECK-NEXT: movl $64, %edi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi)
+; CHECK-NEXT: movl $64, %edi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm1
+; CHECK-NEXT: movl $64, %edi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm2
+; CHECK-NEXT: movl $64, %edi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0
+; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
+; CHECK-NEXT: movl $64, %edi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi)
+; CHECK-NEXT: movl $64, %edi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0
+; CHECK-NEXT: movl $buf2, %edx
+; CHECK-NEXT: movl $32, %esi
+; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; CHECK-NEXT: leaq -8(%rbp), %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-NEXT: tilerelease
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #3
+ %1 = extractvalue { x86_amx, x86_amx } %0, 0
+ %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #3
+ %3 = extractvalue { x86_amx, x86_amx } %0, 1
+ %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #3
+ %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #3
+ %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #3
+ %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #3
+ %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #3
+ %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #3
+ %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #3
+ %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #3
+ %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #3
+ tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #3
+ ret void
+}
+
+declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1
+
+declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2
+
+declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3
+
+declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3
+
+declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2
+
+declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4
+
+attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose" }
+attributes #1 = { argmemonly nofree nounwind readonly }
+attributes #2 = { nofree nosync nounwind readnone }
+attributes #3 = { nounwind }
+attributes #4 = { argmemonly nounwind writeonly }
+
+!llvm.module.flags = !{!0, !1, !2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 2}
+!2 = !{i32 7, !"frame-pointer", i32 2}
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir
new file mode 100644
index 00000000000000..dc79134321e9c2
--- /dev/null
+++ b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir
@@ -0,0 +1,165 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
+# RUN: -mattr=+amx-transpose -run-pass=fasttileconfig -o - %s | FileCheck %s
+
+---
+name: test_tile_2rpntlvwz0
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHCatchret: false
+hasEHScopes: false
+hasEHFunclets: false
+failsVerification: false
+tracksDebugUserValues: false
+registers: []
+liveins:
+ - { reg: '$edi', virtual-reg: '' }
+ - { reg: '$esi', virtual-reg: '' }
+ - { reg: '$edx', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1024
+ adjustsStack: false
+ hasCalls: true
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack:
+ - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 1, name: '', type: default, offset: 0, size: 8, alignment: 8,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 2, name: '', type: default, offset: 0, size: 8, alignment: 8,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 3, name: '', type: default, offset: 0, size: 8, alignment: 8,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 4, name: '', type: default, offset: 0, size: 64, alignment: 4,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 5, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 6, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 7, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ amxProgModel: ManagedRA
+body: |
+ bb.0.entry:
+ liveins: $rdi, $rsi, $rdx, $rax
+
+ ; CHECK-LABEL: name: test_tile_2rpntlvwz0
+ ; CHECK: liveins: $rdi, $rsi, $rdx, $rax
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $zmm0 = AVX512_512_SET0
+ ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4)
+ ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4)
+ ; CHECK-NEXT: renamable $rcx = MOV32ri64 64
+ ; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7)
+ ; CHECK-NEXT: renamable $cx = MOV16ri 64
+ ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5)
+ ; CHECK-NEXT: renamable $cx = MOV16ri 16
+ ; CHECK-NEXT: renamable $r8w = MOV16ri 16
+ ; CHECK-NEXT: MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6)
+ ; CHECK-NEXT: $al = IMPLICIT_DEF
+ ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4)
+ ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4)
+ ; CHECK-NEXT: $al = IMPLICIT_DEF
+ ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 50, $noreg, $al :: (store (s512) into %stack.4 + 50, align 2, basealign 4)
+ ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 20, $noreg, $cx :: (store (s512) into %stack.4 + 20, align 4)
+ ; CHECK-NEXT: $al = IMPLICIT_DEF
+ ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 49, $noreg, $al :: (store (s512) into %stack.4 + 49, align 1, basealign 4)
+ ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 18, $noreg, $di :: (store (s512) into %stack.4 + 18, align 2, basealign 4)
+ ; CHECK-NEXT: $al = IMPLICIT_DEF
+ ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4)
+ ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4)
+ ; CHECK-NEXT: $al = IMPLICIT_DEF
+ ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4)
+ ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4)
+ ; CHECK-NEXT: $al = IMPLICIT_DEF
+ ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 52, $noreg, $al :: (store (s512) into %stack.4 + 52, align 4)
+ ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 24, $noreg, $cx :: (store (s512) into %stack.4 + 24, align 4)
+ ; CHECK-NEXT: $al = IMPLICIT_DEF
+ ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 53, $noreg, $al :: (store (s512) into %stack.4 + 53, align 1, basealign 4)
+ ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 26, $noreg, $di :: (store (s512) into %stack.4 + 26, align 2, basealign 4)
+ ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4)
+ ; CHECK-NEXT: renamable $r9 = COPY $rsi
+ ; CHECK-NEXT: $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7)
+ ; CHECK-NEXT: renamable $r8 = COPY $rdi
+ ; CHECK-NEXT: $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6)
+ ; CHECK-NEXT: renamable $r10 = COPY $rax
+ ; CHECK-NEXT: $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5)
+ ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg
+ ; CHECK-NEXT: renamable $tmm0 = COPY renamable $tmm5
+ ; CHECK-NEXT: renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5
+ ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1
+ ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
+ ; CHECK-NEXT: renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx
+ ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
+ ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg
+ ; CHECK-NEXT: renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg
+ ; CHECK-NEXT: renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg
+ ; CHECK-NEXT: renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2
+ ; CHECK-NEXT: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
+ renamable $zmm0 = AVX512_512_SET0
+ VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4)
+ MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4)
+ renamable $rcx = MOV32ri64 64
+ MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7)
+ renamable $cx = MOV16ri 64
+ MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5)
+ renamable $cx = MOV16ri 16
+ renamable $r8w = MOV16ri 16
+ MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6)
+ PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4)
+ renamable $r9 = COPY $rsi
+ $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7)
+ renamable $r8 = COPY $rdi
+ $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6)
+ renamable $r10 = COPY $rax
+ $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5)
+ renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg
+ renamable $tmm0 = COPY renamable $tmm5
+ renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5
+ PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1
+ PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
+ renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx
+ PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
+ renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg
+ renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg
+ renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg
+ renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2
+ PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
+...
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir
new file mode 100644
index 00000000000000..e62a52162d5238
--- /dev/null
+++ b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir
@@ -0,0 +1,153 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
+# RUN: -mattr=+amx-transpose -run-pass=greedy,tileconfig -o - %s | FileCheck %s
+
+--- |
+ @buf = dso_local global [2048 x i8] zeroinitializer, align 16
+ @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16
+
+ define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 {
+ entry:
+ %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf, i64 0, i64 0), i64 32) #5
+ %1 = extractvalue { x86_amx, x86_amx } %0, 0
+ %2 = extractvalue { x86_amx, x86_amx } %0, 1
+ %3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5
+ %4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %3, x86_amx %1, x86_amx %2) #5
+ tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf2, i64 0, i64 0), i64 32, x86_amx %4) #5
+ ret void
+ }
+
+ declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) #1
+
+ declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2
+
+ declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3
+
+ declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3
+
+ declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2
+
+ declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #4
+
+ attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" }
+ attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
+ attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
+ attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
+ attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
+ attributes #5 = { nounwind }
+
+...
+---
+name: test_tile_2rpntlvwz0
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHCatchret: false
+hasEHScopes: false
+hasEHFunclets: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: gr32, preferred-register: '' }
+ - { id: 1, class: gr32, preferred-register: '' }
+ - { id: 2, class: gr32, preferred-register: '' }
+ - { id: 3, class: gr16, preferred-register: '' }
+ - { id: 4, class: gr16, preferred-register: '' }
+ - { id: 5, class: gr16, preferred-register: '' }
+ - { id: 6, class: gr64, preferred-register: '' }
+ - { id: 7, class: gr64_nosp, preferred-register: '' }
+ - { id: 8, class: tilepair, preferred-register: '' }
+ - { id: 9, class: tile, preferred-register: '' }
+ - { id: 10, class: tile, preferred-register: '' }
+ - { id: 11, class: tile, preferred-register: '' }
+ - { id: 12, class: tile, preferred-register: '' }
+ - { id: 13, class: gr64, preferred-register: '' }
+ - { id: 14, class: vr512, preferred-register: '' }
+liveins:
+ - { reg: '$edi', virtual-reg: '%0' }
+ - { reg: '$esi', virtual-reg: '%1' }
+ - { reg: '$edx', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 4
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack:
+ - { id: 0, name: '', type: default, offset: 0, size: 64, alignment: 4,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ amxProgModel: ManagedRA
+body: |
+ bb.0.entry:
+ liveins: $edi, $esi, $edx
+
+
+ ; CHECK-LABEL: name: test_tile_2rpntlvwz0
+ ; CHECK: liveins: $edi, $esi, $edx
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi
+ ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0
+ ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4)
+ ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4)
+ ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 26, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 26, align 2, basealign 4)
+ ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 53, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 53, align 1, basealign 4)
+ ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 24, $noreg, [[COPY1]].sub_16bit :: (store (s512) into %stack.0 + 24, align 4)
+ ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 52, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 52, align 4)
+ ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 16, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 16, align 4)
+ ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 48, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 48, align 4)
+ ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4)
+ ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64 = MOV32ri64 @buf
+ ; CHECK-NEXT: [[MOV32ri64_1:%[0-9]+]]:gr64_nosp = MOV32ri64 32
+ ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[COPY]].sub_16bit, [[MOV32ri64_]], 1, [[MOV32ri64_1]], 0, $noreg
+ ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit
+ ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY2]].sub_16bit, [[COPY]].sub_16bit, [[COPY1]].sub_16bit, [[PTILEZEROV]], [[PT2RPNTLVWZ0V]].sub_t0, [[PT2RPNTLVWZ0V]].sub_t1
+ ; CHECK-NEXT: [[MOV32ri64_2:%[0-9]+]]:gr64 = MOV32ri64 @buf2
+ ; CHECK-NEXT: PTILESTOREDV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[MOV32ri64_2]], 1, [[MOV32ri64_1]], 0, $noreg, [[PTILEZEROV]]
+ ; CHECK-NEXT: RET 0
+ %2:gr32 = COPY $edx
+ %1:gr32 = COPY $esi
+ %0:gr32 = COPY $edi
+ %14:vr512 = AVX512_512_SET0
+ VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, %14 :: (store (s512) into %stack.0, align 4)
+ MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4)
+ PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4)
+ %6:gr64 = MOV32ri64 @buf
+ %7:gr64_nosp = MOV32ri64 32
+ %8:tilepair = PT2RPNTLVWZ0V %0.sub_16bit, %1.sub_16bit, %2.sub_16bit, %6, 1, %7, 0, $noreg
+ %12:tile = PTILEZEROV %0.sub_16bit, %1.sub_16bit
+ %12:tile = PTDPBSSDV %0.sub_16bit, %2.sub_16bit, %1.sub_16bit, %12, %8.sub_t0, %8.sub_t1
+ %13:gr64 = MOV32ri64 @buf2
+ PTILESTOREDV %0.sub_16bit, %1.sub_16bit, %13, 1, %7, 0, $noreg, %12
+ RET 0
+
+...
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir b/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir
new file mode 100644
index 00000000000000..857ad433af153f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir
@@ -0,0 +1,97 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
+# RUN: -mattr=+amx-transpose -run-pass=lowertilecopy -o - %s | FileCheck %s
+
+---
+name: test_tile_2rpntlvwz0
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHCatchret: false
+hasEHScopes: false
+hasEHFunclets: false
+failsVerification: false
+tracksDebugUserValues: false
+registers: []
+liveins:
+ - { reg: '$edi', virtual-reg: '' }
+ - { reg: '$esi', virtual-reg: '' }
+ - { reg: '$edx', virtual-reg: '' }
+ - { reg: '$cx', virtual-reg: '' }
+ - { reg: '$r9', virtual-reg: '' }
+ - { reg: '$r10', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1024
+ adjustsStack: false
+ hasCalls: true
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack:
+ - { id: 43, name: '', type: default, offset: 0, size: 64, alignment: 4,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 68, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ amxProgModel: ManagedRA
+body: |
+ bb.0.entry:
+ liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9
+
+
+ ; CHECK-LABEL: name: test_tile_2rpntlvwz0
+ ; CHECK: liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4)
+ ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg
+ ; CHECK-NEXT: $rax = MOV64ri 64
+ ; CHECK-NEXT: TILESTORED %stack.3, 1, $rax, 0, $noreg, $tmm5 :: (store (s8192) into %stack.3)
+ ; CHECK-NEXT: $tmm0 = TILELOADD %stack.3, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.3)
+ ; CHECK-NEXT: $rax = MOV64ri 64
+ ; CHECK-NEXT: TILESTORED %stack.2, 1, $rax, 0, $noreg, $tmm4 :: (store (s8192) into %stack.2)
+ ; CHECK-NEXT: $tmm1 = TILELOADD %stack.2, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.2)
+ ; CHECK-NEXT: renamable $r8 = MOV32ri64 64
+ ; CHECK-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.1)
+ ; CHECK-NEXT: renamable $di = MOV16ri 64
+ ; CHECK-NEXT: renamable $cx = MOV16ri 16
+ ; CHECK-NEXT: PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1
+ ; CHECK-NEXT: PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0
+ PLDTILECFGV %stack.43, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.43, align 4)
+ renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg
+ renamable $tmm0 = COPY renamable $tmm5
+ renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5
+ renamable $r8 = MOV32ri64 64
+ MOV64mr %stack.68, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.68)
+ renamable $di = MOV16ri 64
+ renamable $cx = MOV16ri 16
+ PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1
+ PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0
+
+...
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll
new file mode 100644
index 00000000000000..52641c65c90e98
--- /dev/null
+++ b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+ ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
+
+ @buf = dso_local global [2048 x i8] zeroinitializer, align 16
+
+ ; Function Attrs: noinline nounwind optnone uwtable
+ define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1, ptr %m) #0 {
+; CHECK-LABEL: @test_tile_2rpntlvwz0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = udiv i16 [[COL1:%.*]], 4
+; CHECK-NEXT: [[TMP1:%.*]] = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[COL0]] to i64
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M:%.*]], i64 [[TMP3]], x86_amx [[TMP2]])
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[COL1]] to i64
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP6]], x86_amx [[TMP5]])
+; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]]
+; CHECK-NEXT: [[TMP9:%.*]] = sext i16 [[COL0]] to i64
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP9]], x86_amx [[TMP8]])
+; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[COL0]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP11]])
+; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[COL1]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP14]])
+; CHECK-NEXT: [[TMP17:%.*]] = sext i16 [[COL0]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP0]], i16 [[COL0]], ptr [[M]], i64 [[TMP17]])
+; CHECK-NEXT: [[TMP20:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL0]], i16 [[COL1]], x86_amx [[TMP13]], x86_amx [[TMP16]], x86_amx [[TMP19]]) #[[ATTR3]]
+; CHECK-NEXT: [[TMP21:%.*]] = sext i16 [[COL0]] to i64
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP21]], x86_amx [[TMP20]])
+; CHECK-NEXT: ret void
+;
+ entry:
+
+ %0 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr getelementptr inbounds ([2048 x i8], ptr @buf, i64 0, i64 0), i64 32) #7
+ %1 = extractvalue { x86_amx, x86_amx } %0, 0
+ %2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #7
+ store <256 x i32> %2, ptr %m, align 1024
+
+ %3 = extractvalue { x86_amx, x86_amx } %0, 1
+ %4 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #7
+ store <256 x i32> %4, ptr %m, align 1024
+
+ %5 = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #7
+ %6 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #7
+ store <256 x i32> %6, ptr %m, align 64
+
+ %7 = load <256 x i32>, ptr %m, align 64
+ %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %7) #7
+ %9 = load <256 x i32>, ptr %m, align 64
+ %10 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %9) #7
+ %11 = load <256 x i32>, ptr %m, align 64
+ %12 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #7
+
+ %13 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col0, i16 %col1, x86_amx %8, x86_amx %10, x86_amx %12) #7
+ %14 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %13) #7
+ store <256 x i32> %14, ptr %m, align 64
+
+ ret void
+ }
+
+ ; Function Attrs: argmemonly nounwind readonly
+ declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #2
+
+ ; Function Attrs: nounwind readnone
+ declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #3
+
+ ; Function Attrs: nounwind
+ declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #4
+
+ ; Function Attrs: nounwind
+ declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #4
+
+ ; Function Attrs: nounwind readnone
+ declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #3
+
+ ; Function Attrs: argmemonly nounwind writeonly
+ declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #5
+
+ attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" }
+ attributes #1 = { argmemonly nofree nounwind willreturn writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
+ attributes #2 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
+ attributes #3 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
+ attributes #4 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
+ attributes #5 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
+ attributes #6 = { argmemonly nofree nounwind willreturn "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
+ attributes #7 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll
new file mode 100644
index 00000000000000..346d46b6b16c2e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s
+
+ @buf = dso_local global [2048 x i8] zeroinitializer, align 16
+ @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16
+
+ ; Function Attrs: nounwind uwtable
+ define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 {
+; CHECK-LABEL: @test_tile_2rpntlvwz0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1:%.*]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]]
+; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL1]], i16 [[COL0]], x86_amx [[TMP3]], x86_amx [[TMP1]], x86_amx [[TMP2]]) #[[ATTR3]]
+; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr @buf2, i64 32, x86_amx [[TMP4]]) #[[ATTR3]]
+; CHECK-NEXT: ret void
+;
+ entry:
+ %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #5
+ %1 = extractvalue { x86_amx, x86_amx } %0, 0
+ %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #5
+ %3 = extractvalue { x86_amx, x86_amx } %0, 1
+ %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #5
+ %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5
+ %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #5
+ %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #5
+ %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #5
+ %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #5
+ %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #5
+ %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #5
+ %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #5
+ tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #5
+ ret void
+ }
+
+ ; Function Attrs: argmemonly nounwind readonly
+ declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1
+
+ ; Function Attrs: nounwind readnone
+ declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2
+
+ ; Function Attrs: nounwind
+ declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3
+
+ ; Function Attrs: nounwind
+ declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3
+
+ ; Function Attrs: nounwind readnone
+ declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2
+
+ ; Function Attrs: argmemonly nounwind writeonly
+ declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4
+
+ attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" }
+ attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
+ attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
+ attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
+ attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
+ attributes #5 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir
new file mode 100644
index 00000000000000..cdc525193fef71
--- /dev/null
+++ b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir
@@ -0,0 +1,134 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
+# RUN: -mattr=+amx-transpose -run-pass=fastpretileconfig -o - %s | FileCheck %s
+
+---
+name: test_tile_2rpntlvwz0
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHCatchret: false
+hasEHScopes: false
+hasEHFunclets: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: gr64_nosp, preferred-register: '' }
+ - { id: 1, class: gr16, preferred-register: '' }
+ - { id: 2, class: gr16, preferred-register: '' }
+ - { id: 3, class: gr16, preferred-register: '' }
+ - { id: 4, class: gr64, preferred-register: '' }
+ - { id: 5, class: gr64, preferred-register: '' }
+ - { id: 6, class: gr64, preferred-register: '' }
+ - { id: 7, class: gr64_nosp, preferred-register: '' }
+ - { id: 8, class: tilepair, preferred-register: '' }
+ - { id: 9, class: tile, preferred-register: '' }
+ - { id: 10, class: tile, preferred-register: '' }
+ - { id: 11, class: tile, preferred-register: '' }
+ - { id: 181, class: tile, preferred-register: '' }
+ - { id: 183, class: tile, preferred-register: '' }
+ - { id: 185, class: tile, preferred-register: '' }
+ - { id: 186, class: tile, preferred-register: '' }
+liveins:
+ - { reg: '$edi', virtual-reg: '%0' }
+ - { reg: '$esi', virtual-reg: '%1' }
+ - { reg: '$edx', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1024
+ adjustsStack: false
+ hasCalls: true
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack:
+ - { id: 18, name: '', type: default, offset: 0, size: 8, alignment: 8,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 19, name: '', type: default, offset: 0, size: 8, alignment: 8,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 20, name: '', type: default, offset: 0, size: 8, alignment: 8,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 21, name: '', type: default, offset: 0, size: 8,
+ alignment: 8, stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ amxProgModel: ManagedRA
+body: |
+ bb.0.entry:
+ liveins: $rdi, $rsi, $rdx, $rax
+
+ ; CHECK-LABEL: name: test_tile_2rpntlvwz0
+ ; CHECK: liveins: $rdi, $rsi, $rdx, $rax
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0
+ ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.4, align 4)
+ ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4)
+ ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 64
+ ; CHECK-NEXT: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 64
+ ; CHECK-NEXT: [[MOV16ri1:%[0-9]+]]:gr16 = MOV16ri 16
+ ; CHECK-NEXT: [[MOV16ri2:%[0-9]+]]:gr16 = MOV16ri 16
+ ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.4, align 4)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY $rdx
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64_nosp = COPY $rax
+ ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[COPY2]], 1, killed [[COPY3]], 0, $noreg
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0
+ ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY5]]
+ ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY4]]
+ ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV16ri]], [[MOV16ri1]]
+ ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTILEZEROV]]
+ ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg
+ ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg
+ ; CHECK-NEXT: [[PTILELOADDV2:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg
+ ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[PTILELOADDV]], killed [[PTILELOADDV1]], killed [[PTILELOADDV2]]
+ ; CHECK-NEXT: PTILESTOREDV killed [[MOV16ri]], killed [[MOV16ri1]], killed [[COPY2]], 1, killed [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]]
+ %0:gr64_nosp = MOV32ri64 64
+ %1:gr16 = MOV16ri 64
+ %2:gr16 = MOV16ri 16
+ %3:gr16 = MOV16ri 16
+ %4:gr64 = COPY $rsi
+ %5:gr64 = COPY $rdi
+ %6:gr64 = COPY $rdx
+ %7:gr64_nosp = COPY $rax
+ %8:tilepair = PT2RPNTLVWZ0V %1, %2, %3, %6, 1, killed %7, 0, $noreg
+ %9:tile = COPY %8.sub_t1
+ %10:tile = COPY %8.sub_t0
+ PTILESTOREDV %1, %2, %4, 1, %0, 0, $noreg, killed %10
+ PTILESTOREDV %1, %3, %5, 1, %0, 0, $noreg, killed %9
+ %11:tile = PTILEZEROV %1, %2
+ PTILESTOREDV %1, %2, %6, 1, %0, 0, $noreg, killed %11
+ %181:tile = PTILELOADDV %1, %2, %4, 1, %0, 0, $noreg
+ %183:tile = PTILELOADDV %1, %3, %5, 1, %0, 0, $noreg
+ %185:tile = PTILELOADDV %1, %2, %6, 1, %0, 0, $noreg
+ %186:tile = PTDPBSSDV %1, %2, %3, %181, killed %183, killed %185
+ PTILESTOREDV killed %1, killed %2, killed %6, 1, killed %0, 0, $noreg, killed %186
+...
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir
new file mode 100644
index 00000000000000..a9824dcac6b04a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir
@@ -0,0 +1,113 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
+# RUN: -mattr=+amx-transpose -run-pass=tilepreconfig -o - %s | FileCheck %s
+
+---
+name: test_tile_2rpntlvwz0
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHCatchret: false
+hasEHScopes: false
+hasEHFunclets: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: gr32, preferred-register: '' }
+ - { id: 1, class: gr32, preferred-register: '' }
+ - { id: 2, class: gr32, preferred-register: '' }
+ - { id: 3, class: gr16, preferred-register: '' }
+ - { id: 4, class: gr16, preferred-register: '' }
+ - { id: 5, class: gr16, preferred-register: '' }
+ - { id: 6, class: gr64, preferred-register: '' }
+ - { id: 7, class: gr64_nosp, preferred-register: '' }
+ - { id: 8, class: tilepair, preferred-register: '' }
+ - { id: 9, class: tile, preferred-register: '' }
+ - { id: 10, class: tile, preferred-register: '' }
+ - { id: 11, class: tile, preferred-register: '' }
+ - { id: 12, class: tile, preferred-register: '' }
+ - { id: 13, class: gr64, preferred-register: '' }
+liveins:
+ - { reg: '$edi', virtual-reg: '%0' }
+ - { reg: '$esi', virtual-reg: '%1' }
+ - { reg: '$edx', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo:
+ amxProgModel: ManagedRA
+body: |
+ bb.0.entry:
+ liveins: $edi, $esi, $edx, $rax, $rbx
+
+ ; CHECK-LABEL: name: test_tile_2rpntlvwz0
+ ; CHECK: liveins: $edi, $esi, $edx, $rax, $rbx
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0
+ ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4)
+ ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr16 = COPY [[COPY1]].sub_16bit
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr16 = COPY [[COPY2]].sub_16bit
+ ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.0, align 4)
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY $rax
+ ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 32
+ ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY5]], [[COPY4]], [[COPY3]], killed [[COPY6]], 1, [[MOV32ri64_]], 0, $noreg
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0
+ ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY5]], [[COPY4]]
+ ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[COPY5]], [[COPY3]], [[COPY4]], [[PTILEZEROV]], killed [[COPY8]], killed [[COPY7]]
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY $rbx
+ ; CHECK-NEXT: PTILESTOREDV [[COPY5]], [[COPY4]], killed [[COPY9]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]]
+ ; CHECK-NEXT: RET 0
+ %2:gr32 = COPY $edx
+ %1:gr32 = COPY $esi
+ %0:gr32 = COPY $edi
+ %3:gr16 = COPY %2.sub_16bit
+ %4:gr16 = COPY %1.sub_16bit
+ %5:gr16 = COPY %0.sub_16bit
+ %6:gr64 = COPY $rax
+ %7:gr64_nosp = MOV32ri64 32
+ %8:tilepair = PT2RPNTLVWZ0V %5, %4, %3, killed %6, 1, %7, 0, $noreg
+ %9:tile = COPY %8.sub_t1
+ %10:tile = COPY %8.sub_t0
+ %11:tile = PTILEZEROV %5, %4
+ %12:tile = PTDPBSSDV %5, %3, %4, %11, killed %10, killed %9
+ %13:gr64 = COPY $rbx
+ PTILESTOREDV %5, %4, killed %13, 1, %7, 0, $noreg, killed %12
+ RET 0
+
+...
diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll
new file mode 100644
index 00000000000000..b06a9369b9762d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll
@@ -0,0 +1,150 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-tile,+amx-bf16,+amx-int8,+amx-transpose | FileCheck %s
+
+define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 {
+; CHECK-LABEL: test_amx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0
+; CHECK-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2
+; CHECK-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0
+; CHECK-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2
+; CHECK-NEXT: ttransposed %tmm3, %tmm1
+; CHECK-NEXT: retq
+ call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride)
+ call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride)
+ call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride)
+ call void @llvm.x86.t2rpntlvwz1t1(i8 2, i8* %addr1, i64 %stride)
+ call void @llvm.x86.ttransposed(i8 1, i8 3)
+ ret void
+}
+
+declare void @llvm.x86.t2rpntlvwz0(i8 %tile1, i8* %addr1, i64 %stride)
+declare void @llvm.x86.t2rpntlvwz0t1(i8 %tile1, i8* %addr1, i64 %stride)
+declare void @llvm.x86.t2rpntlvwz1(i8 %tile1, i8* %addr1, i64 %stride)
+declare void @llvm.x86.t2rpntlvwz1t1(i8 %tile1, i8* %addr1, i64 %stride)
+declare void @llvm.x86.ttransposed(i8 %tile0, i8 %tile1)
+
+define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 {
+; CHECK-LABEL: test_amx3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: movw $8, %cx
+; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4
+; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4
+; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4
+; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4
+; CHECK-NEXT: ttransposed %tmm4, %tmm0
+; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx)
+; CHECK-NEXT: tilerelease
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
+ %2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
+ %3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
+ %4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
+ %5 = extractvalue { x86_amx, x86_amx } %4, 0
+ %6 = call x86_amx @llvm.x86.ttransposed.internal(i16 8, i16 8, x86_amx %5)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %6)
+ ret void
+}
+
+define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 {
+; CHECK-LABEL: test_amx_spill:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $6088, %rsp # imm = 0x17C8
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $8, %ax
+; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0
+; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4
+; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6
+; CHECK-NEXT: tilestored %tmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill
+; CHECK-NEXT: tilestored %tmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill
+; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6
+; CHECK-NEXT: tilestored %tmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill
+; CHECK-NEXT: tilestored %tmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill
+; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6
+; CHECK-NEXT: tilestored %tmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill
+; CHECK-NEXT: tilestored %tmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill
+; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6
+; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
+; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
+; CHECK-NEXT: movabsq $64, %rcx
+; CHECK-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
+; CHECK-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
+; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
+; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
+; CHECK-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
+; CHECK-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
+; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
+; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
+; CHECK-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
+; CHECK-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
+; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
+; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
+; CHECK-NEXT: tilestored %tmm6, (%rsi,%rdx)
+; CHECK-NEXT: tilestored %tmm7, (%rsi,%rdx)
+; CHECK-NEXT: addq $6088, %rsp # imm = 0x17C8
+; CHECK-NEXT: tilerelease
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
+ %b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
+ %b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
+ %b3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
+ %b4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
+ %b5 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
+ %e11 = extractvalue { x86_amx, x86_amx } %b1, 0
+ %e12 = extractvalue { x86_amx, x86_amx } %b1, 1
+ %e21 = extractvalue { x86_amx, x86_amx } %b2, 0
+ %e22 = extractvalue { x86_amx, x86_amx } %b2, 1
+ %e31 = extractvalue { x86_amx, x86_amx } %b3, 0
+ %e32 = extractvalue { x86_amx, x86_amx } %b3, 1
+ %e41 = extractvalue { x86_amx, x86_amx } %b4, 0
+ %e42 = extractvalue { x86_amx, x86_amx } %b4, 1
+ %e51 = extractvalue { x86_amx, x86_amx } %b5, 0
+ %e52 = extractvalue { x86_amx, x86_amx } %b5, 1
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e11)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e12)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e21)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e22)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e31)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e32)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e41)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e42)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e51)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e52)
+ ret void
+}
+
+declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
+declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64)
+declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16, i16, i16, i8*, i64)
+declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16, i16, i16, i8*, i64)
+declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16, i16, i16, i8*, i64)
+declare x86_amx @llvm.x86.ttransposed.internal(i16, i16, x86_amx)
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/ipra-reg-usage.ll b/llvm/test/CodeGen/X86/ipra-reg-usage.ll
index d1b8be15a2d034..9b123b730a214f 100644
--- a/llvm/test/CodeGen/X86/ipra-reg-usage.ll
+++ b/llvm/test/CodeGen/X86/ipra-reg-usage.ll
@@ -3,7 +3,7 @@
target triple = "x86_64-unknown-unknown"
declare void @bar1()
define preserve_allcc void @foo()#0 {
-; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
+; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
call void @bar1()
call void @bar2()
ret void
@@ -11,7 +11,7 @@ define preserve_allcc void @foo()#0 {
declare void @bar2()
define preserve_nonecc void @foo2()#0 {
-; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
+; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
call void @bar1()
call void @bar2()
ret void
diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-att.s b/llvm/test/MC/Disassembler/X86/amx-transpose-att.s
new file mode 100644
index 00000000000000..da3fa95ef6dd06
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/amx-transpose-att.s
@@ -0,0 +1,57 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK: t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4
+// CHECK: encoding: [0xc4,0xa2,0x78,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4
+
+// CHECK: t2rpntlvwz0 291(%r8,%rax,4), %tmm2
+// CHECK: encoding: [0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz0 291(%r8,%rax,4), %tmm2
+
+// CHECK: t2rpntlvwz0 -32(,%rbp,2), %tmm2
+// CHECK: encoding: [0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz0 -32(,%rbp,2), %tmm2
+
+// CHECK: t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm4
+// CHECK: encoding: [0xc4,0xa2,0x78,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm5
+
+// CHECK: t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2
+// CHECK: encoding: [0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2
+
+// CHECK: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2
+// CHECK: encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz0t1 -32(,%rbp,2), %tmm2
+
+// CHECK: t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm4
+// CHECK: encoding: [0xc4,0xa2,0x79,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm5
+
+// CHECK: t2rpntlvwz1 291(%r8,%rax,4), %tmm2
+// CHECK: encoding: [0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz1 291(%r8,%rax,4), %tmm2
+
+// CHECK: t2rpntlvwz1 -32(,%rbp,2), %tmm2
+// CHECK: encoding: [0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz1 -32(,%rbp,2), %tmm2
+
+// CHECK: t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm2
+// CHECK: encoding: [0xc4,0xa2,0x79,0x6f,0x94,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm3
+
+// CHECK: t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2
+// CHECK: encoding: [0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2
+
+// CHECK: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2
+// CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz1t1 -32(,%rbp,2), %tmm2
+
+// CHECK: ttransposed %tmm1, %tmm5
+// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9]
+ ttransposed %tmm1, %tmm5
+
+// CHECK: ttransposed %tmm2, %tmm3
+// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xda]
+ ttransposed %tmm2, %tmm3
diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt
new file mode 100644
index 00000000000000..e4f1689639ef9a
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt
@@ -0,0 +1,58 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: t2rpntlvwz0 268435456(%rbp,%r14,8), %tmm4
+# INTEL: t2rpntlvwz0 tmm4, [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x78,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: t2rpntlvwz0 291(%r8,%rax,4), %tmm2
+# INTEL: t2rpntlvwz0 tmm2, [r8 + 4*rax + 291]
+0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: t2rpntlvwz0 -32(,%rbp,2), %tmm2
+# INTEL: t2rpntlvwz0 tmm2, [2*rbp - 32]
+0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT: t2rpntlvwz0t1 268435456(%rbp,%r14,8), %tmm4
+# INTEL: t2rpntlvwz0t1 tmm4, [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x78,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: t2rpntlvwz0t1 291(%r8,%rax,4), %tmm2
+# INTEL: t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291]
+0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2
+# INTEL: t2rpntlvwz0t1 tmm2, [2*rbp - 32]
+0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT: t2rpntlvwz1 268435456(%rbp,%r14,8), %tmm4
+# INTEL: t2rpntlvwz1 tmm4, [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x79,0x6e,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: t2rpntlvwz1 291(%r8,%rax,4), %tmm2
+# INTEL: t2rpntlvwz1 tmm2, [r8 + 4*rax + 291]
+0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: t2rpntlvwz1 -32(,%rbp,2), %tmm2
+# INTEL: t2rpntlvwz1 tmm2, [2*rbp - 32]
+0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT: t2rpntlvwz1t1 268435456(%rbp,%r14,8), %tmm4
+# INTEL: t2rpntlvwz1t1 tmm4, [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x79,0x6f,0xa4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT: t2rpntlvwz1t1 291(%r8,%rax,4), %tmm2
+# INTEL: t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291]
+0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2
+# INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32]
+0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT: ttransposed %tmm1, %tmm2
+# INTEL: ttransposed tmm2, tmm1
+0xc4,0xe2,0x7a,0x5f,0xd1
+
+# ATT: ttransposed %tmm2, %tmm3
+# INTEL: ttransposed tmm3, tmm2
+0xc4,0xe2,0x7a,0x5f,0xda
diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-intel.s b/llvm/test/MC/Disassembler/X86/amx-transpose-intel.s
new file mode 100644
index 00000000000000..3b8dfaed313d61
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/amx-transpose-intel.s
@@ -0,0 +1,57 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: t2rpntlvwz0 tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x78,0x6e,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz0 tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: t2rpntlvwz0 tmm2, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x78,0x6e,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz0 tmm2, [r8 + 4*rax + 291]
+
+// CHECK: t2rpntlvwz0 tmm2, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe2,0x78,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz0 tmm2, [2*rbp - 32]
+
+// CHECK: t2rpntlvwz0t1 tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x78,0x6f,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz0t1 tmm7, [rbp + 8*r14 + 268435456]
+
+// CHECK: t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x78,0x6f,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz0t1 tmm2, [r8 + 4*rax + 291]
+
+// CHECK: t2rpntlvwz0t1 tmm2, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz0t1 tmm2, [2*rbp - 32]
+
+// CHECK: t2rpntlvwz1 tmm0, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x79,0x6e,0x84,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz1 tmm1, [rbp + 8*r14 + 268435456]
+
+// CHECK: t2rpntlvwz1 tmm2, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x79,0x6e,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz1 tmm2, [r8 + 4*rax + 291]
+
+// CHECK: t2rpntlvwz1 tmm2, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe2,0x79,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz1 tmm2, [2*rbp - 32]
+
+// CHECK: t2rpntlvwz1t1 tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x79,0x6f,0xb4,0xf5,0x00,0x00,0x00,0x10]
+ t2rpntlvwz1t1 tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x79,0x6f,0x94,0x80,0x23,0x01,0x00,0x00]
+ t2rpntlvwz1t1 tmm2, [r8 + 4*rax + 291]
+
+// CHECK: t2rpntlvwz1t1 tmm2, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff]
+ t2rpntlvwz1t1 tmm2, [2*rbp - 32]
+
+// CHECK: ttransposed tmm5, tmm1
+// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9]
+ ttransposed tmm5, tmm1
+
+// CHECK: ttransposed tmm3, tmm2
+// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xda]
+ ttransposed tmm3, tmm2
diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
index 8f22d097681b1f..87f96ed28e326b 100644
--- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
+++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
@@ -1113,7 +1113,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondSpills) {
// Create a stack location and ensure it's tracked.
SpillLoc SL = {getRegByName("RSP"), StackOffset::getFixed(-8)};
SpillLocationNo SpillNo = *MTracker->getOrTrackSpillLoc(SL);
- ASSERT_EQ(MTracker->getNumLocs(), 11u); // Tracks all possible stack locs.
+ ASSERT_EQ(MTracker->getNumLocs(), 13u); // Tracks all possible stack locs.
// Locations are: RSP, stack slots from 2^3 bits wide up to 2^9 for zmm regs,
// then slots for sub_8bit_hi and sub_16bit_hi ({8, 8} and {16, 16}).
// Finally, one for spilt fp80 registers.
@@ -1135,7 +1135,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondSpills) {
// There are other locations, for things like xmm0, which we're going to
// ignore here.
- auto [MInLocs, MOutLocs] = allocValueTables(4, 11);
+ auto [MInLocs, MOutLocs] = allocValueTables(4, 13);
// Transfer function: start with nothing.
SmallVector<MLocTransferMap, 1> TransferFunc;
@@ -1170,7 +1170,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondSpills) {
// function.
TransferFunc[1].insert({ALStackLoc, ALDefInBlk1});
TransferFunc[1].insert({HAXStackLoc, HAXDefInBlk1});
- initValueArray(MInLocs, 4, 11);
+ initValueArray(MInLocs, 4, 13);
placeMLocPHIs(*MF, AllBlocks, MInLocs, TransferFunc);
EXPECT_EQ(MInLocs[3][ALStackLoc.asU64()], ALPHI);
EXPECT_EQ(MInLocs[3][AXStackLoc.asU64()], AXPHI);
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp
index 26b881651ea411..c6cd3da13646a6 100644
--- a/llvm/utils/TableGen/X86RecognizableInstr.cpp
+++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp
@@ -1162,6 +1162,7 @@ OperandType RecognizableInstr::typeFromString(const std::string &s,
TYPE("vz512mem", TYPE_MVSIBZ)
TYPE("BNDR", TYPE_BNDR)
TYPE("TILE", TYPE_TMM)
+ TYPE("TILEPair", TYPE_TMM_PAIR)
errs() << "Unhandled type string " << s << "\n";
llvm_unreachable("Unhandled type string");
}
@@ -1243,6 +1244,7 @@ RecognizableInstr::rmRegisterEncodingFromString(const std::string &s,
ENCODING("VK64", ENCODING_RM)
ENCODING("BNDR", ENCODING_RM)
ENCODING("TILE", ENCODING_RM)
+ ENCODING("TILEPair", ENCODING_RM)
errs() << "Unhandled R/M register encoding " << s << "\n";
llvm_unreachable("Unhandled R/M register encoding");
}
@@ -1292,6 +1294,7 @@ RecognizableInstr::roRegisterEncodingFromString(const std::string &s,
ENCODING("VK64WM", ENCODING_REG)
ENCODING("BNDR", ENCODING_REG)
ENCODING("TILE", ENCODING_REG)
+ ENCODING("TILEPair", ENCODING_REG)
errs() << "Unhandled reg/opcode register encoding " << s << "\n";
llvm_unreachable("Unhandled reg/opcode register encoding");
}
@@ -1322,6 +1325,7 @@ RecognizableInstr::vvvvRegisterEncodingFromString(const std::string &s,
ENCODING("VK32", ENCODING_VVVV)
ENCODING("VK64", ENCODING_VVVV)
ENCODING("TILE", ENCODING_VVVV)
+ ENCODING("TILEPair", ENCODING_VVVV)
errs() << "Unhandled VEX.vvvv register encoding " << s << "\n";
llvm_unreachable("Unhandled VEX.vvvv register encoding");
}
>From f822950342402c03b0597b173f38f007c6fa3931 Mon Sep 17 00:00:00 2001
From: "Wang, Phoebe" <phoebe.wang at intel.com>
Date: Tue, 29 Oct 2024 22:04:30 +0800
Subject: [PATCH 2/3] Address review comment
---
llvm/include/llvm/CodeGen/TileShapeInfo.h | 2 +-
llvm/lib/Target/X86/X86ExpandPseudo.cpp | 2 +-
llvm/lib/Target/X86/X86FastPreTileConfig.cpp | 2 +-
llvm/lib/TargetParser/Host.cpp | 4 ++--
4 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TileShapeInfo.h b/llvm/include/llvm/CodeGen/TileShapeInfo.h
index 0e0a883b0c5953..269d8e2964f4f7 100644
--- a/llvm/include/llvm/CodeGen/TileShapeInfo.h
+++ b/llvm/include/llvm/CodeGen/TileShapeInfo.h
@@ -36,7 +36,7 @@ class ShapeT {
}
// When ShapeT has mult shapes, we only use Shapes (never use Row and Col)
// and ImmShapes. Due to the most case is only one shape (just simply use
- // Shape.Row or Shape.Col), so here we don't merge Row and Col into vertor
+ // Shape.Row or Shape.Col), so here we don't merge Row and Col into vector
// Shapes to keep the speed and code simplicity.
// TODO: The upper solution is a temporary way to minimize current tile
// register allocation code changes. It can not handle both Reg shape and
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 5584c08a983037..f832955d1202fa 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -615,7 +615,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MBB.erase(MBBI);
return true;
}
- // Smilar with TILEPAIRLOAD, TILEPAIRSTORE is just for TILEPair spill, no
+ // Similar with TILEPAIRLOAD, TILEPAIRSTORE is just for TILEPair spill, no
// corresponding AMX instruction to support it. So, split it too:
// "TILEPAIRSTORE Base, Scale, Index, Offset, Segment, TMM0:TMM1" -->
// "TILESTORE Base, Scale, Index, Offset, Segment, TMM0" +
diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
index fd0987db5e4f55..62d0f6ca794348 100644
--- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
@@ -268,7 +268,7 @@ void X86FastPreTileConfig::reload(MachineBasicBlock::iterator UseMI,
<< printReg(TileReg, TRI) << '\n');
}
-unsigned getTileDefNum(MachineRegisterInfo *MRI, Register Reg) {
+static unsigned getTileDefNum(MachineRegisterInfo *MRI, Register Reg) {
if (Reg.isVirtual()) {
unsigned RegClassID = MRI->getRegClass(Reg)->getID();
if (RegClassID == X86::TILERegClassID)
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index d48b5029dc740b..a9befee9577ec1 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1875,8 +1875,8 @@ const StringMap<bool> sys::getHostCPUFeatures() {
MaxLevel >= 0x19 && !getX86CpuIDAndInfo(0x19, &EAX, &EBX, &ECX, &EDX);
Features["widekl"] = HasLeaf7 && HasLeaf19 && ((EBX >> 2) & 1);
- bool HasLeaf1E =
- MaxLevel >= 0x1e && !getX86CpuIDAndInfo(0x1e, &EAX, &EBX, &ECX, &EDX);
+ bool HasLeaf1E = MaxLevel >= 0x1e &&
+ !getX86CpuIDAndInfoEx(0x1e, 0x1, &EAX, &EBX, &ECX, &EDX);
Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave;
bool HasLeaf24 =
>From 5e762272c39984495a7bc9288f845811f10abd93 Mon Sep 17 00:00:00 2001
From: "Wang, Phoebe" <phoebe.wang at intel.com>
Date: Fri, 1 Nov 2024 15:10:08 +0800
Subject: [PATCH 3/3] Address review comments
---
clang/lib/CodeGen/CGBuiltin.cpp | 2 +-
clang/lib/Headers/amxtransposeintrin.h | 6 +++---
llvm/include/llvm/CodeGen/TileShapeInfo.h | 6 +++---
llvm/lib/Target/X86/X86LowerAMXType.cpp | 10 +++++-----
4 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 879cf9fa8913ed..34fedd67114751 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17040,7 +17040,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]);
// Note: Here we escape directly use x86_tilestored64_internal to store
- // the results due to it can't make sure the Mem writen scope. This may
+ // the results due to it can't make sure the Mem written scope. This may
// cause shapes reloads after first amx intrinsic, which current amx reg-
// ister allocation has no ability to handle it.
diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h
index d5dc68f4152848..b3fa37d766c45b 100644
--- a/clang/lib/Headers/amxtransposeintrin.h
+++ b/clang/lib/Headers/amxtransposeintrin.h
@@ -32,7 +32,7 @@
/// \headerfile <immintrin.h>
///
/// \code
-/// void __tile_transposed(__tile dst, __tile src);
+/// void _tile_transposed(__tile dst, __tile src);
/// \endcode
///
/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
@@ -40,7 +40,7 @@
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src
-/// The 1st source tile. Max size is 1024 Bytes.
+/// The source tile. Max size is 1024 Bytes.
///
/// \code{.operation}
///
@@ -238,7 +238,7 @@ static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src
-/// The 1st source tile. Max size is 1024 Bytes.
+/// The source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_transposed(__tile1024i *dst, __tile1024i src) {
dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile);
diff --git a/llvm/include/llvm/CodeGen/TileShapeInfo.h b/llvm/include/llvm/CodeGen/TileShapeInfo.h
index 269d8e2964f4f7..24f303a7d9d137 100644
--- a/llvm/include/llvm/CodeGen/TileShapeInfo.h
+++ b/llvm/include/llvm/CodeGen/TileShapeInfo.h
@@ -34,14 +34,14 @@ class ShapeT {
if (MRI)
deduceImm(MRI);
}
- // When ShapeT has mult shapes, we only use Shapes (never use Row and Col)
+ // When ShapeT has multiple shapes, we only use Shapes (never use Row and Col)
// and ImmShapes. Due to the most case is only one shape (just simply use
// Shape.Row or Shape.Col), so here we don't merge Row and Col into vector
// Shapes to keep the speed and code simplicity.
// TODO: The upper solution is a temporary way to minimize current tile
// register allocation code changes. It can not handle both Reg shape and
// Imm shape for different shapes (e.g. shape 1 is reg shape while shape 2
- // is imm shape). Refine me when we have more mult-tile shape instructions!
+ // is imm shape). Refine me when we have more multi-tile shape instructions!
ShapeT(ArrayRef<MachineOperand *> ShapesOperands,
const MachineRegisterInfo *MRI = nullptr)
: Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
@@ -57,7 +57,7 @@ class ShapeT {
ShapeT()
: Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
ColImm(InvalidImmShape) {}
- // TODO: We need to extern cmp operator for muti-shapes if
+ // TODO: We need to extern cmp operator for multi-shapes if
// we have requirement in the future.
bool operator==(const ShapeT &Shape) const {
MachineOperand *R = Shape.Row;
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index b9d811e91ab5a3..688e886cf3b13a 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -551,7 +551,7 @@ static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) {
assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!");
auto *II = dyn_cast<IntrinsicInst>(TileDef);
unsigned Idx = 0;
- // Extract tile from mult tiles' def.
+ // Extract tile from multiple tiles' def.
if (auto *Extr = dyn_cast<ExtractValueInst>(TileDef)) {
assert(Extr->hasIndices() && "Tile extract miss index!");
Idx = Extr->getIndices()[0];
@@ -584,7 +584,7 @@ static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) {
Value *PhiOp = cast<PHINode>(V)->getIncomingValue(0);
II = cast<IntrinsicInst>(PhiOp);
} else if (auto *Extr = dyn_cast<ExtractValueInst>(V)) {
- // Extract tile from mult tiles' def.
+ // Extract tile from multiple tiles' def.
assert(Extr->hasIndices() && "Tile extract miss index!");
Idx = Extr->getIndices()[0];
II = cast<IntrinsicInst>(Extr->getOperand(0));
@@ -1040,7 +1040,7 @@ bool X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) {
assert(Tile->getType()->isX86_AMXTy() && "Not Tile Operand!");
- // TODO: Specially handle the mult-use case.
+ // TODO: Specially handle the multi-use case.
if (Tile->getNumUses() != 1)
return false;
@@ -1057,8 +1057,8 @@ bool X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) {
Row = II->getOperand(0);
Col = II->getOperand(1);
} else {
- // Now we supported mult-tiles value in structure, so we may get tile
- // from extracting mult-tiles structure.
+ // Now we supported multi-tiles value in structure, so we may get tile
+ // from extracting multi-tiles structure.
// For example:
// %6 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %1,
// i16 %2, i16 %3, i8* %4, i64 %5)
More information about the cfe-commits
mailing list