[llvm-branch-commits] [clang] f80b298 - [X86] AMX programming model.
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Dec 10 01:11:29 PST 2020
Author: Luo, Yuanke
Date: 2020-12-10T17:01:54+08:00
New Revision: f80b29878b0448efb306b9c93c49e5deb9ba2738
URL: https://github.com/llvm/llvm-project/commit/f80b29878b0448efb306b9c93c49e5deb9ba2738
DIFF: https://github.com/llvm/llvm-project/commit/f80b29878b0448efb306b9c93c49e5deb9ba2738.diff
LOG: [X86] AMX programming model.
This patch implements the AMX programming model discussed on llvm-dev
(http://lists.llvm.org/pipermail/llvm-dev/2020-August/144302.html).
Thanks to Hal for the good suggestion on the RA. The fast RA is not in the patch yet.
This patch implements 7 components; a usage sketch of the C interface is shown
after the revision link below.
1. The C interface to end users.
2. The AMX intrinsics in LLVM IR.
3. Transformation of load/store <256 x i32> into AMX intrinsics, or a split of
the type into two <128 x i32>.
4. Lowering from AMX intrinsics to AMX pseudo instructions.
5. Insertion of pseudo ldtilecfg and construction of the def-use relationship
between ldtilecfg and the AMX instructions.
6. Register allocation for tile registers.
7. Morphing of AMX pseudo instructions into real AMX instructions.
Change-Id: I935e1080916ffcb72af54c2c83faa8b2e97d5cb0
Differential Revision: https://reviews.llvm.org/D87981
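Below is a minimal usage sketch of the new C interface (component 1). It is only
an illustration, assuming a compiler built with this patch and the amx-tile /
amx-int8 target features enabled; it mirrors the clang/test/CodeGen/X86/amx_api.c
test added below. The buffer names and the dot_product function are made up for
the example.

  #include <immintrin.h>

  char bufA[1024], bufB[1024], bufC[1024];
  #define STRIDE 32

  void dot_product(short row, short col) {
    /* Declare tiles with their shapes: a is row x 8, b is 8 x col, c is row x col. */
    __tile1024i a = {row, 8};
    __tile1024i b = {8, col};
    __tile1024i c = {row, col};

    /* Each call lowers to the corresponding internal AMX intrinsic. */
    __tile_loadd(&a, bufA, STRIDE);  /* llvm.x86.tileloadd64.internal */
    __tile_loadd(&b, bufB, STRIDE);
    __tile_loadd(&c, bufC, STRIDE);
    __tile_dpbsud(&c, a, b);         /* llvm.x86.tdpbssd.internal */
    __tile_stored(bufC, STRIDE, c);  /* llvm.x86.tilestored64.internal */
  }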
Added:
clang/test/CodeGen/X86/amx_api.c
llvm/include/llvm/CodeGen/TileShapeInfo.h
llvm/lib/Target/X86/X86LowerAMXType.cpp
llvm/lib/Target/X86/X86PreTileConfig.cpp
llvm/lib/Target/X86/X86TileConfig.cpp
llvm/test/CodeGen/X86/AMX/amx-across-func.ll
llvm/test/CodeGen/X86/AMX/amx-config.ll
llvm/test/CodeGen/X86/AMX/amx-spill.ll
llvm/test/CodeGen/X86/AMX/amx-type.ll
Modified:
clang/include/clang/Basic/BuiltinsX86_64.def
clang/lib/Headers/amxintrin.h
llvm/include/llvm/CodeGen/LiveIntervalUnion.h
llvm/include/llvm/CodeGen/LiveRegMatrix.h
llvm/include/llvm/CodeGen/Passes.h
llvm/include/llvm/CodeGen/VirtRegMap.h
llvm/include/llvm/IR/Intrinsics.td
llvm/include/llvm/IR/IntrinsicsX86.td
llvm/lib/CodeGen/InlineSpiller.cpp
llvm/lib/CodeGen/LiveIntervalUnion.cpp
llvm/lib/CodeGen/LiveRegMatrix.cpp
llvm/lib/CodeGen/VirtRegMap.cpp
llvm/lib/Target/X86/CMakeLists.txt
llvm/lib/Target/X86/X86.h
llvm/lib/Target/X86/X86ExpandPseudo.cpp
llvm/lib/Target/X86/X86FrameLowering.cpp
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86InstrAMX.td
llvm/lib/Target/X86/X86InstrInfo.cpp
llvm/lib/Target/X86/X86RegisterInfo.cpp
llvm/lib/Target/X86/X86RegisterInfo.h
llvm/lib/Target/X86/X86RegisterInfo.td
llvm/lib/Target/X86/X86Subtarget.h
llvm/lib/Target/X86/X86TargetMachine.cpp
llvm/test/CodeGen/X86/O0-pipeline.ll
llvm/test/CodeGen/X86/ipra-reg-usage.ll
llvm/test/CodeGen/X86/opt-pipeline.ll
llvm/test/CodeGen/X86/statepoint-fixup-invoke.mir
llvm/test/CodeGen/X86/statepoint-fixup-shared-ehpad.mir
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 3e186af82ff7..98327ade17e8 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -100,6 +100,10 @@ TARGET_BUILTIN(__builtin_ia32_stui, "v", "n", "uintr")
TARGET_BUILTIN(__builtin_ia32_testui, "Uc", "n", "uintr")
TARGET_BUILTIN(__builtin_ia32_senduipi, "vUWi", "n", "uintr")
+// AMX internal builtin
+TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
+TARGET_BUILTIN(__builtin_ia32_tilestored64_internal, "vUsUsv*zV256i", "n", "amx-tile")
// AMX
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h
index 58254e21c81a..03a468ef15b1 100644
--- a/clang/lib/Headers/amxintrin.h
+++ b/clang/lib/Headers/amxintrin.h
@@ -15,8 +15,8 @@
#define __AMXINTRIN_H
#ifdef __x86_64__
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
+#define __DEFAULT_FN_ATTRS_TILE \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
/// Load tile configuration from a 64-byte memory location specified by
/// "mem_addr". The tile configuration includes the tile type palette, the
@@ -31,9 +31,8 @@
///
/// \param __config
/// A pointer to 512-bits configuration
-static __inline__ void __DEFAULT_FN_ATTRS
-_tile_loadconfig(const void *__config)
-{
+static __inline__ void __DEFAULT_FN_ATTRS_TILE
+_tile_loadconfig(const void *__config) {
__builtin_ia32_tile_loadconfig(__config);
}
@@ -48,9 +47,8 @@ _tile_loadconfig(const void *__config)
///
/// \param __config
/// A pointer to 512-bits configuration
-static __inline__ void __DEFAULT_FN_ATTRS
-_tile_storeconfig(void *__config)
-{
+static __inline__ void __DEFAULT_FN_ATTRS_TILE
+_tile_storeconfig(void *__config) {
__builtin_ia32_tile_storeconfig(__config);
}
@@ -60,9 +58,7 @@ _tile_storeconfig(void *__config)
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
-static __inline__ void __DEFAULT_FN_ATTRS
-_tile_release(void)
-{
+static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
__builtin_ia32_tilerelease();
}
@@ -80,8 +76,9 @@ _tile_release(void)
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
-#define _tile_loadd(dst, base, stride) \
- __builtin_ia32_tileloadd64((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride))
+#define _tile_loadd(dst, base, stride) \
+ __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
+ (__SIZE_TYPE__)(stride))
/// Load tile rows from memory specifieid by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
@@ -99,8 +96,9 @@ _tile_release(void)
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
-#define _tile_stream_loadd(dst, base, stride) \
- __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride))
+#define _tile_stream_loadd(dst, base, stride) \
+ __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
+ (__SIZE_TYPE__)(stride))
/// Store the tile specified by "src" to memory specifieid by "base" address and
/// "stride" using the tile configuration previously configured via
@@ -116,7 +114,7 @@ _tile_release(void)
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be stored in memory.
-#define _tile_stored(dst, base, stride) \
+#define _tile_stored(dst, base, stride) \
__builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
/// Zero the tile specified by "tdest".
@@ -145,7 +143,8 @@ _tile_release(void)
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbssd(dst, src0, src1) __builtin_ia32_tdpbssd((dst), (src0), (src1))
+#define _tile_dpbssd(dst, src0, src1) \
+ __builtin_ia32_tdpbssd((dst), (src0), (src1))
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
@@ -163,7 +162,8 @@ _tile_release(void)
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbsud(dst, src0, src1) __builtin_ia32_tdpbsud((dst), (src0), (src1))
+#define _tile_dpbsud(dst, src0, src1) \
+ __builtin_ia32_tdpbsud((dst), (src0), (src1))
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
@@ -181,7 +181,8 @@ _tile_release(void)
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbusd(dst, src0, src1) __builtin_ia32_tdpbusd((dst), (src0), (src1))
+#define _tile_dpbusd(dst, src0, src1) \
+ __builtin_ia32_tdpbusd((dst), (src0), (src1))
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
@@ -199,7 +200,8 @@ _tile_release(void)
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbuud(dst, src0, src1) __builtin_ia32_tdpbuud((dst), (src0), (src1))
+#define _tile_dpbuud(dst, src0, src1) \
+ __builtin_ia32_tdpbuud((dst), (src0), (src1))
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
@@ -216,10 +218,56 @@ _tile_release(void)
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbf16ps(dst, src0, src1) \
+#define _tile_dpbf16ps(dst, src0, src1) \
__builtin_ia32_tdpbf16ps((dst), (src0), (src1))
-#undef __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS_INT8 \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
+
+typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
+ __SIZE_TYPE__ stride) {
+ return __builtin_ia32_tileloadd64_internal(m, n, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
+ _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS_INT8
+_tile_stored_internal(unsigned short m, unsigned short n, void *base,
+ __SIZE_TYPE__ stride, _tile1024i tile) {
+ return __builtin_ia32_tilestored64_internal(m, n, base,
+ (__SIZE_TYPE__)(stride), tile);
+}
+
+typedef struct __tile1024i_str {
+ const unsigned short row;
+ const unsigned short col;
+ _tile1024i tile;
+} __tile1024i;
+
+__DEFAULT_FN_ATTRS_INT8
+static void __tile_loadd(__tile1024i *dst, const void *base,
+ __SIZE_TYPE__ stride) {
+ dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
+}
+
+__DEFAULT_FN_ATTRS_INT8
+static void __tile_dpbsud(__tile1024i *dst, __tile1024i src1,
+ __tile1024i src2) {
+ dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile,
+ src1.tile, src2.tile);
+}
+
+__DEFAULT_FN_ATTRS_INT8
+static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
+ _tile_stored_internal(src.row, src.col, base, stride, src.tile);
+}
#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */
diff --git a/clang/test/CodeGen/X86/amx_api.c b/clang/test/CodeGen/X86/amx_api.c
new file mode 100644
index 000000000000..c7fab8c6ed41
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_api.c
@@ -0,0 +1,51 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx512f -target-feature +amx-int8 \
+// RUN: -target-feature +amx-bf16 -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK
+
+#include <immintrin.h>
+
+char buf[1024];
+#define STRIDE 32
+
+char buf2[1024];
+
+// This is an example code and integration test.
+void test_api(int cond, short row, short col) {
+ //CHECK-LABEL: @test_api
+ //CHECK: call <256 x i32> @llvm.x86.tileloadd64.internal
+ //CHECK: call <256 x i32> @llvm.x86.tdpbssd.internal
+ //CHECK: call void @llvm.x86.tilestored64.internal
+ __tile1024i a = {row, 8};
+ __tile1024i b = {8, col};
+ __tile1024i c = {row, col};
+
+ if (cond) {
+ __tile_loadd(&a, buf, STRIDE);
+ __tile_loadd(&b, buf, STRIDE);
+ __tile_loadd(&c, buf, STRIDE);
+ } else {
+ __tile_loadd(&a, buf2, STRIDE);
+ __tile_loadd(&b, buf2, STRIDE);
+ __tile_loadd(&c, buf2, STRIDE);
+ }
+ __tile_dpbsud(&c, a, b);
+ __tile_stored(buf, STRIDE, c);
+}
+
+void test_tile_loadd(short row, short col) {
+ //CHECK-LABEL: @test_tile_loadd
+ //CHECK: call <256 x i32> @llvm.x86.tileloadd64.internal
+ __tile1024i a = {row, col};
+ __tile_loadd(&a, buf, STRIDE);
+}
+
+void test_tile_dpbsud(__tile1024i a, __tile1024i b, __tile1024i c) {
+ //CHECK-LABEL: @test_tile_dpbsud
+ //CHECK: call <256 x i32> @llvm.x86.tdpbssd.internal
+ __tile_dpbsud(&c, a, b);
+}
+
+void test_tile_stored(__tile1024i c) {
+ //CHECK-LABEL: @test_tile_stored
+ //CHECK: call void @llvm.x86.tilestored64.internal
+ __tile_stored(buf, STRIDE, c);
+}
diff --git a/llvm/include/llvm/CodeGen/LiveIntervalUnion.h b/llvm/include/llvm/CodeGen/LiveIntervalUnion.h
index c555763a4ec2..ad9e06d2bcf0 100644
--- a/llvm/include/llvm/CodeGen/LiveIntervalUnion.h
+++ b/llvm/include/llvm/CodeGen/LiveIntervalUnion.h
@@ -104,6 +104,9 @@ class LiveIntervalUnion {
void verify(LiveVirtRegBitSet& VisitedVRegs);
#endif
+ // Get any virtual register that is assigned to this physical unit.
+ LiveInterval *getOneVReg() const;
+
/// Query interferences between a single live virtual register and a live
/// interval union.
class Query {
diff --git a/llvm/include/llvm/CodeGen/LiveRegMatrix.h b/llvm/include/llvm/CodeGen/LiveRegMatrix.h
index a3f8f88e810b..fc67bce329ab 100644
--- a/llvm/include/llvm/CodeGen/LiveRegMatrix.h
+++ b/llvm/include/llvm/CodeGen/LiveRegMatrix.h
@@ -153,6 +153,8 @@ class LiveRegMatrix : public MachineFunctionPass {
/// Directly access the live interval unions per regunit.
/// This returns an array indexed by the regunit number.
LiveIntervalUnion *getLiveUnions() { return &Matrix[0]; }
+
+ Register getOneVReg(unsigned PhysReg) const;
};
} // end namespace llvm
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index a74334e6200c..47037bac6270 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -485,6 +485,10 @@ namespace llvm {
/// The pass fixups statepoint machine instruction to replace usage of
/// caller saved registers with stack slots.
extern char &FixupStatepointCallerSavedID;
+
+ /// This pass transforms load/store <256 x i32> into AMX load/store intrinsics,
+ /// or splits the data into two <128 x i32> halves.
+ FunctionPass *createX86LowerAMXTypePass();
} // End llvm namespace
#endif
diff --git a/llvm/include/llvm/CodeGen/TileShapeInfo.h b/llvm/include/llvm/CodeGen/TileShapeInfo.h
new file mode 100644
index 000000000000..f7ad81c25ebb
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/TileShapeInfo.h
@@ -0,0 +1,107 @@
+//===- llvm/CodeGen/TileShapeInfo.h - ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Shape utility for AMX.
+/// AMX hardware requires the shape of a tile data register to be configured
+/// before use. The 2D shape consists of a row and a column. In the AMX
+/// intrinsics interface the shape is passed as the 1st and 2nd parameters and
+/// lowered to the 1st and 2nd machine operands of AMX pseudo instructions.
+/// The ShapeT class facilitates tile configuration and register allocation.
+/// The row and column are machine operands of AMX pseudo instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_TILESHAPEINFO_H
+#define LLVM_CODEGEN_TILESHAPEINFO_H
+
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Register.h"
+#include <utility>
+
+using namespace llvm;
+
+namespace llvm {
+
+class ShapeT {
+public:
+ ShapeT(MachineOperand *Row, MachineOperand *Col,
+ const MachineRegisterInfo *MRI = nullptr)
+ : Row(Row), Col(Col) {
+ if (MRI)
+ deduceImm(MRI);
+ }
+ ShapeT()
+ : Row(nullptr), Col(nullptr), RowImm(InvalidImmShape),
+ ColImm(InvalidImmShape) {}
+ bool operator==(const ShapeT &Shape) {
+ MachineOperand *R = Shape.Row;
+ MachineOperand *C = Shape.Col;
+ if (!R || !C)
+ return false;
+ if (!Row || !Col)
+ return false;
+ if (Row->getReg() == R->getReg() && Col->getReg() == C->getReg())
+ return true;
+ if ((RowImm != InvalidImmShape) && (ColImm != InvalidImmShape))
+ return RowImm == Shape.getRowImm() && ColImm == Shape.getColImm();
+ return false;
+ }
+
+ bool operator!=(const ShapeT &Shape) { return !(*this == Shape); }
+
+ ShapeT &operator=(const ShapeT &RHS) {
+ Row = RHS.Row;
+ Col = RHS.Col;
+ RowImm = RHS.RowImm;
+ ColImm = RHS.ColImm;
+ return *this;
+ }
+
+ MachineOperand *getRow() const { return Row; }
+
+ MachineOperand *getCol() const { return Col; }
+
+ int64_t getRowImm() const { return RowImm; }
+
+ int64_t getColImm() const { return ColImm; }
+
+ bool isValid() { return (Row != nullptr) && (Col != nullptr); }
+
+ void deduceImm(const MachineRegisterInfo *MRI) {
+ // All defs must be the same value; otherwise the MIs are invalid.
+ // Find the immediate.
+ // TODO: copy propagation.
+ auto GetImm = [&](Register Reg) {
+ int64_t Imm = InvalidImmShape;
+ for (const MachineOperand &DefMO : MRI->def_operands(Reg)) {
+ const auto *MI = DefMO.getParent();
+ if (MI->isMoveImmediate()) {
+ Imm = MI->getOperand(1).getImm();
+ break;
+ }
+ }
+ return Imm;
+ };
+ RowImm = GetImm(Row->getReg());
+ ColImm = GetImm(Col->getReg());
+ }
+
+private:
+ static constexpr int64_t InvalidImmShape = -1;
+ MachineOperand *Row;
+ MachineOperand *Col;
+ int64_t RowImm;
+ int64_t ColImm;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/CodeGen/VirtRegMap.h b/llvm/include/llvm/CodeGen/VirtRegMap.h
index cb6bbd32f434..1775afb51bdf 100644
--- a/llvm/include/llvm/CodeGen/VirtRegMap.h
+++ b/llvm/include/llvm/CodeGen/VirtRegMap.h
@@ -19,6 +19,7 @@
#include "llvm/ADT/IndexedMap.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TileShapeInfo.h"
#include "llvm/Pass.h"
#include <cassert>
@@ -60,6 +61,10 @@ class TargetInstrInfo;
/// mapping.
IndexedMap<unsigned, VirtReg2IndexFunctor> Virt2SplitMap;
+ /// Virt2ShapeMap - For an X86 AMX register, maps the virtual register to its
+ /// bound shape information.
+ DenseMap<unsigned, ShapeT> Virt2ShapeMap;
+
/// createSpillSlot - Allocate a spill slot for RC from MFI.
unsigned createSpillSlot(const TargetRegisterClass *RC);
@@ -107,6 +112,21 @@ class TargetInstrInfo;
/// the specified physical register
void assignVirt2Phys(Register virtReg, MCPhysReg physReg);
+ bool isShapeMapEmpty() const { return Virt2ShapeMap.empty(); }
+
+ bool hasShape(Register virtReg) const {
+ return getShape(virtReg).isValid();
+ }
+
+ ShapeT getShape(Register virtReg) const {
+ assert(virtReg.isVirtual());
+ return Virt2ShapeMap.lookup(virtReg);
+ }
+
+ void assignVirt2Shape(Register virtReg, ShapeT shape) {
+ Virt2ShapeMap[virtReg.id()] = shape;
+ }
+
/// clears the specified virtual register's, physical
/// register mapping
void clearVirt(Register virtReg) {
@@ -133,6 +153,9 @@ class TargetInstrInfo;
/// records virtReg is a split live interval from SReg.
void setIsSplitFromReg(Register virtReg, unsigned SReg) {
Virt2SplitMap[virtReg.id()] = SReg;
+ if (hasShape(SReg)) {
+ Virt2ShapeMap[virtReg.id()] = getShape(SReg);
+ }
}
/// returns the live interval virtReg is split from.
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index eb6c408b4f85..331434bd212d 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -292,6 +292,7 @@ def llvm_v8i32_ty : LLVMType<v8i32>; // 8 x i32
def llvm_v16i32_ty : LLVMType<v16i32>; // 16 x i32
def llvm_v32i32_ty : LLVMType<v32i32>; // 32 x i32
def llvm_v64i32_ty : LLVMType<v64i32>; // 64 x i32
+def llvm_v256i32_ty : LLVMType<v256i32>; //256 x i32
def llvm_v1i64_ty : LLVMType<v1i64>; // 1 x i64
def llvm_v2i64_ty : LLVMType<v2i64>; // 2 x i64
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index d426d866f576..d0503d15824a 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5056,4 +5056,20 @@ let TargetPrefix = "x86" in {
Intrinsic<[llvm_i8_ty], [], []>;
def int_x86_senduipi : GCCBuiltin<"__builtin_ia32_senduipi">,
Intrinsic<[], [llvm_i64_ty], []>;
+// AMX - internal intrinsics
+ def int_x86_tileloadd64_internal :
+ GCCBuiltin<"__builtin_ia32_tileloadd64_internal">,
+ Intrinsic<[llvm_v256i32_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+ []>;
+ def int_x86_tdpbssd_internal :
+ GCCBuiltin<"__builtin_ia32_tdpbssd_internal">,
+ Intrinsic<[llvm_v256i32_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
+ llvm_v256i32_ty, llvm_v256i32_ty,
+ llvm_v256i32_ty], []>;
+ def int_x86_tilestored64_internal :
+ GCCBuiltin<"__builtin_ia32_tilestored64_internal">,
+ Intrinsic<[], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty,
+ llvm_i64_ty, llvm_v256i32_ty], []>;
}
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index 975d51c4cd13..02f7721c219a 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -1558,4 +1558,6 @@ void HoistSpillHelper::LRE_DidCloneVirtReg(Register New, Register Old) {
VRM.assignVirt2StackSlot(New, VRM.getStackSlot(Old));
else
llvm_unreachable("VReg should be assigned either physreg or stackslot");
+ if (VRM.hasShape(Old))
+ VRM.assignVirt2Shape(New, VRM.getShape(Old));
}
diff --git a/llvm/lib/CodeGen/LiveIntervalUnion.cpp b/llvm/lib/CodeGen/LiveIntervalUnion.cpp
index cccc14e4e8a4..7ccb8df4bc05 100644
--- a/llvm/lib/CodeGen/LiveIntervalUnion.cpp
+++ b/llvm/lib/CodeGen/LiveIntervalUnion.cpp
@@ -99,6 +99,16 @@ void LiveIntervalUnion::verify(LiveVirtRegBitSet& VisitedVRegs) {
}
#endif //!NDEBUG
+LiveInterval *LiveIntervalUnion::getOneVReg() const {
+ if (empty())
+ return nullptr;
+ for (LiveSegments::const_iterator SI = Segments.begin(); SI.valid(); ++SI) {
+ // return the first valid live interval
+ return SI.value();
+ }
+ return nullptr;
+}
+
// Scan the vector of interfering virtual registers in this union. Assume it's
// quite small.
bool LiveIntervalUnion::Query::isSeenInterference(LiveInterval *VirtReg) const {
diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp
index 59c7f93fd915..a69aa6557e46 100644
--- a/llvm/lib/CodeGen/LiveRegMatrix.cpp
+++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp
@@ -221,3 +221,13 @@ bool LiveRegMatrix::checkInterference(SlotIndex Start, SlotIndex End,
}
return false;
}
+
+Register LiveRegMatrix::getOneVReg(unsigned PhysReg) const {
+ LiveInterval *VRegInterval = nullptr;
+ for (MCRegUnitIterator Unit(PhysReg, TRI); Unit.isValid(); ++Unit) {
+ if ((VRegInterval = Matrix[*Unit].getOneVReg()))
+ return VRegInterval->reg();
+ }
+
+ return MCRegister::NoRegister;
+}
diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp
index f1401b307397..5e0ff9d9092c 100644
--- a/llvm/lib/CodeGen/VirtRegMap.cpp
+++ b/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -68,6 +68,7 @@ bool VirtRegMap::runOnMachineFunction(MachineFunction &mf) {
Virt2PhysMap.clear();
Virt2StackSlotMap.clear();
Virt2SplitMap.clear();
+ Virt2ShapeMap.clear();
grow();
return false;
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index 8c7fee59789a..5a5002670296 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -32,6 +32,9 @@ set(sources
X86CmovConversion.cpp
X86DomainReassignment.cpp
X86DiscriminateMemOps.cpp
+ X86LowerAMXType.cpp
+ X86TileConfig.cpp
+ X86PreTileConfig.cpp
X86ExpandPseudo.cpp
X86FastISel.cpp
X86FixupBWInsts.cpp
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index d4ad10d79bab..e17b9ba5500b 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -76,6 +76,10 @@ FunctionPass *createX86FlagsCopyLoweringPass();
/// Return a pass that expands WinAlloca pseudo-instructions.
FunctionPass *createX86WinAllocaExpander();
+FunctionPass *createX86TileConfigPass();
+
+FunctionPass *createX86PreTileConfigPass();
+
/// Return a pass that inserts int3 at the end of the function if it ends with a
/// CALL instruction. The pass does the same for each funclet as well. This
/// ensures that the open interval of function start and end PCs contains all
@@ -162,6 +166,9 @@ void initializeX86OptimizeLEAPassPass(PassRegistry &);
void initializeX86PartialReductionPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
+void initializeX86PreTileConfigPass(PassRegistry &);
+void initializeX86TileConfigPass(PassRegistry &);
+void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
namespace X86AS {
enum : unsigned {
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index b1d15225eaaf..a2fe09aecc49 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -461,6 +461,39 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case TargetOpcode::ICALL_BRANCH_FUNNEL:
ExpandICallBranchFunnel(&MBB, MBBI);
return true;
+ case X86::PLDTILECFG: {
+ MI.RemoveOperand(0);
+ MI.setDesc(TII->get(X86::LDTILECFG));
+ return true;
+ }
+ case X86::PSTTILECFG: {
+ MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg
+ MI.setDesc(TII->get(X86::STTILECFG));
+ return true;
+ }
+ case X86::PTILELOADDV: {
+ MI.RemoveOperand(8); // Remove $tmmcfg
+ for (unsigned i = 2; i > 0; --i)
+ MI.RemoveOperand(i);
+ MI.setDesc(TII->get(X86::TILELOADD));
+ return true;
+ }
+ case X86::PTDPBSSDV: {
+ MI.RemoveOperand(7); // Remove $tmmcfg
+ MI.untieRegOperand(4);
+ for (unsigned i = 3; i > 0; --i)
+ MI.RemoveOperand(i);
+ MI.setDesc(TII->get(X86::TDPBSSD));
+ MI.tieOperands(0, 1);
+ return true;
+ }
+ case X86::PTILESTOREDV: {
+ MI.RemoveOperand(8); // Remove $tmmcfg
+ for (int i = 1; i >= 0; --i)
+ MI.RemoveOperand(i);
+ MI.setDesc(TII->get(X86::TILESTORED));
+ return true;
+ }
}
llvm_unreachable("Previous switch has a fallthrough?");
}
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 0f9fcb77727f..735c08791b89 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -2091,6 +2091,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true);
}
}
+
+ // Emit tilerelease for AMX kernel.
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (!MRI.reg_nodbg_empty(X86::TMMCFG))
+ BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
}
StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index a5cb078b2257..e0f54a2f4c1f 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4572,6 +4572,49 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, Res);
return;
}
+ case Intrinsic::x86_tileloadd64_internal: {
+ if (!Subtarget->hasAMXTILE())
+ break;
+ unsigned Opc = X86::PTILELOADDV;
+ // _tile_loadd_internal(row, col, buf, STRIDE)
+ SDValue Base = Node->getOperand(4);
+ SDValue Scale = getI8Imm(1, dl);
+ SDValue Index = Node->getOperand(5);
+ SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
+ SDValue Chain = Node->getOperand(0);
+ MachineSDNode *CNode;
+ SDValue Ops[] = {Node->getOperand(2),
+ Node->getOperand(3),
+ Base,
+ Scale,
+ Index,
+ Disp,
+ Segment,
+ CFG,
+ Chain};
+ CNode = CurDAG->getMachineNode(Opc, dl, {MVT::v256i32, MVT::Other}, Ops);
+ ReplaceNode(Node, CNode);
+ return;
+ }
+ case Intrinsic::x86_tdpbssd_internal: {
+ if (!Subtarget->hasAMXTILE())
+ break;
+ unsigned Opc = X86::PTDPBSSDV;
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
+ SDValue Ops[] = {Node->getOperand(2),
+ Node->getOperand(3),
+ Node->getOperand(4),
+ Node->getOperand(5),
+ Node->getOperand(6),
+ Node->getOperand(7),
+ CFG};
+ MachineSDNode *CNode =
+ CurDAG->getMachineNode(Opc, dl, {MVT::v256i32, MVT::Other}, Ops);
+ ReplaceNode(Node, CNode);
+ return;
+ }
}
break;
}
@@ -4629,6 +4672,31 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
}
+ case Intrinsic::x86_tilestored64_internal: {
+ unsigned Opc = X86::PTILESTOREDV;
+ // _tile_stored_internal(row, col, buf, STRIDE, c)
+ SDValue Base = Node->getOperand(4);
+ SDValue Scale = getI8Imm(1, dl);
+ SDValue Index = Node->getOperand(5);
+ SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
+ SDValue Chain = Node->getOperand(0);
+ MachineSDNode *CNode;
+ SDValue Ops[] = {Node->getOperand(2),
+ Node->getOperand(3),
+ Base,
+ Scale,
+ Index,
+ Disp,
+ Segment,
+ Node->getOperand(6),
+ CFG,
+ Chain};
+ CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ ReplaceNode(Node, CNode);
+ return;
+ }
case Intrinsic::x86_tileloadd64:
case Intrinsic::x86_tileloaddt164:
case Intrinsic::x86_tilestored64: {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5a77cc1f17fc..849f5a06db61 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1897,6 +1897,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
}
+ if (Subtarget.hasAMXTILE()) {
+ addRegisterClass(MVT::v256i32, &X86::TILERegClass);
+ }
+
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -5341,6 +5345,12 @@ bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
// width.
if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
return false;
+
+ // Don't merge into an x86 AMX tile; we only map MVT::v256i32
+ // to the AMX tile register class for AMX intrinsics.
+ if (MemVT == MVT::v256i32)
+ return false;
+
return true;
}
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index b315287afb76..e7346261b40c 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -16,17 +16,21 @@
let Predicates = [HasAMXTILE, In64BitMode] in {
let SchedRW = [WriteSystem] in {
- let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
+ let hasSideEffects = 1,
+ Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
"ldtilecfg\t$src",
[(int_x86_ldtilecfg addr:$src)]>, VEX, T8PS;
+ let hasSideEffects = 1 in
def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
"sttilecfg\t$src",
[(int_x86_sttilecfg addr:$src)]>, VEX, T8PD;
+ let mayLoad = 1 in
def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
(ins sibmem:$src),
"tileloadd\t{$src, $dst|$dst, $src}", []>,
VEX, T8XD;
+ let mayLoad = 1 in
def TILELOADDT1 : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
(ins sibmem:$src),
"tileloaddt1\t{$src, $dst|$dst, $src}", []>,
@@ -34,6 +38,7 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
def TILERELEASE : I<0x49, MRM_C0, (outs), (ins),
"tilerelease", [(int_x86_tilerelease)]>, VEX, T8PS;
+ let mayStore = 1 in
def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs),
(ins sibmem:$dst, TILE:$src),
"tilestored\t{$src, $dst|$dst, $src}", []>,
@@ -42,6 +47,22 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
"tilezero\t$dst", []>,
VEX, T8XD;
+ // Pseudo instruction for RA.
+ let hasSideEffects = 1, mayLoad = 1,
+ Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
+ def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>;
+
+ let hasSideEffects = 1, mayStore = 1 in
+ def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>;
+
+ def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2,
+ opaquemem:$src3,
+ TILECFG:$cfg), []>;
+ def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
+ GR16:$src2, opaquemem:$src3,
+ TILE:$src4, TILECFG:$cfg), []>;
+
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
// To be translated to the actual instructions in X86ISelLowering.cpp
@@ -76,6 +97,12 @@ let Predicates = [HasAMXINT8, In64BitMode] in {
VEX_4V, T8PS;
}
+ // Pseudo instruction for RA.
+ let Constraints = "$src4 = $dst" in
+ def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6, TILECFG:$cfg), []>;
+
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
// To be translated to the actual instructions in X86ISelLowering.cpp
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index ce34de3a11d0..57eac4080cbb 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3796,13 +3796,31 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
const MachineFunction &MF = *MBB.getParent();
assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
"Stack slot too small for store");
- unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
- bool isAligned =
- (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
- RI.canRealignStack(MF);
- unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
- addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
- .addReg(SrcReg, getKillRegState(isKill));
+ if (RC->getID() == X86::TILERegClassID) {
+ unsigned Opc = X86::TILESTORED;
+ // tilestored %tmm, (%sp, %idx)
+ MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+ Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
+ MachineInstr *NewMI =
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+ MachineOperand &MO = NewMI->getOperand(2);
+ MO.setReg(VirtReg);
+ MO.setIsKill(true);
+ } else if (RC->getID() == X86::TILECFGRegClassID) {
+ unsigned Opc = X86::PSTTILECFG;
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+ } else {
+ unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
+ bool isAligned =
+ (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
+ RI.canRealignStack(MF);
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+ }
}
void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -3810,13 +3828,32 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
Register DestReg, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- const MachineFunction &MF = *MBB.getParent();
- unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
- bool isAligned =
- (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
- RI.canRealignStack(MF);
- unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
- addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx);
+ if (RC->getID() == X86::TILERegClassID) {
+ unsigned Opc = X86::TILELOADD;
+ // tileloadd (%sp, %idx), %tmm
+ MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+ Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ MachineInstr *NewMI =
+ BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
+ NewMI = addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
+ FrameIdx);
+ MachineOperand &MO = NewMI->getOperand(3);
+ MO.setReg(VirtReg);
+ MO.setIsKill(true);
+ } else if (RC->getID() == X86::TILECFGRegClassID) {
+ unsigned Opc = X86::PLDTILECFG;
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
+ FrameIdx);
+ } else {
+ const MachineFunction &MF = *MBB.getParent();
+ unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
+ bool isAligned =
+ (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
+ RI.canRealignStack(MF);
+ unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
+ FrameIdx);
+ }
}
bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
new file mode 100644
index 000000000000..ffbcace6377e
--- /dev/null
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -0,0 +1,294 @@
+//===- X86LowerAMXType.cpp - Lower AMX type for load/store -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to transform <256 x i32> load/store
+/// <256 x i32> is mapped to the AMX tile register on X86, and the AMX
+/// instruction set only provides simple operations on tile registers; basic
+/// elementwise operations are not supported by AMX. Since we define the AMX
+/// tile as vector <256 x i32> and only AMX intrinsics can operate on the type,
+/// we need to transform load/store <256 x i32> instructions into AMX
+/// load/store intrinsics. In addition, we split a <256 x i32> into two
+/// <128 x i32> values if the vector is not used or defined by AMX intrinsics,
+/// so that instruction selection can lower it to a size the HW supports.
+//
+//===----------------------------------------------------------------------===//
+//
+#include "X86.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lower-amx-type"
+
+namespace {
+class X86LowerAMXType {
+ Function &Func;
+ const DataLayout &DL;
+ DenseSet<Instruction *> LDSet;
+ DenseSet<Instruction *> STSet;
+ DenseMap<Value *, std::pair<LoadInst *, LoadInst *>> LoadMap;
+
+public:
+ X86LowerAMXType(Function &F) : Func(F), DL(F.getParent()->getDataLayout()) {}
+ bool visit();
+ bool visitLD();
+ bool visitST();
+ void splitST(Instruction *Inst);
+ void splitLD(Instruction *Inst);
+};
+
+// Split v256i32 load/store to 2 v128i32, so that ISel can
+// lower it to proper vector size.
+void X86LowerAMXType::splitST(Instruction *Inst) {
+ StoreInst *ST = dyn_cast<StoreInst>(Inst);
+ IRBuilder<> Builder(ST);
+ LLVMContext &Ctx = Builder.getContext();
+ Type *Ty = ST->getValueOperand()->getType();
+ EVT VT = EVT::getEVT(Ty);
+ EVT HalfVT = VT.getHalfNumVectorElementsVT(Ctx);
+ Type *HalfTy = HalfVT.getTypeForEVT(Ctx);
+
+ LoadInst *Lo, *Hi;
+ std::tie(Lo, Hi) = LoadMap[ST->getValueOperand()];
+ Value *Ptr = ST->getPointerOperand();
+ PointerType *HalfPtrTy = HalfTy->getPointerTo(ST->getPointerAddressSpace());
+ Value *HalfPtr = Builder.CreateBitCast(Ptr, HalfPtrTy);
+ // The HW requires the AMX tile to be 64-byte aligned, but the front end
+ // generates code with the vector alignment, which is the vector size.
+ uint64_t HalfTySize = HalfTy->getPrimitiveSizeInBits().getFixedSize() / 8;
+ Align Alignment = std::min(Lo->getAlign(), Align(HalfTySize));
+ Builder.CreateAlignedStore(Lo, HalfPtr, Alignment, ST->isVolatile());
+
+ HalfPtr = Builder.CreateGEP(HalfTy, HalfPtr, Builder.getInt32(1));
+ Builder.CreateAlignedStore(Hi, HalfPtr, Alignment, ST->isVolatile());
+}
+
+bool X86LowerAMXType::visitST() {
+ if (STSet.empty())
+ return false;
+ for (auto *Inst : STSet) {
+ Value *Row, *Col;
+ const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst->getOperand(0));
+ if (!II)
+ Row = Col = nullptr;
+ else {
+ switch (II->getIntrinsicID()) {
+ default:
+ Row = Col = nullptr;
+ break;
+ case Intrinsic::x86_tileloadd64_internal:
+ case Intrinsic::x86_tdpbssd_internal: {
+ Row = II->getArgOperand(0);
+ Col = II->getArgOperand(1);
+ break;
+ }
+ }
+ }
+ if (!Row) {
+ splitST(Inst);
+ continue;
+ }
+ IRBuilder<> Builder(Inst);
+ LLVMContext &Ctx = Builder.getContext();
+ // Use the maximum column as the stride. It must be the same as the load stride.
+ Value *Stride = Builder.getInt64(64);
+ Value *I8Ptr =
+ Builder.CreateBitCast(Inst->getOperand(1), Type::getInt8PtrTy(Ctx));
+ std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride,
+ Inst->getOperand(0)};
+
+ Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
+ }
+ return true;
+}
+
+void X86LowerAMXType::splitLD(Instruction *Inst) {
+ LoadInst *LD = dyn_cast<LoadInst>(Inst);
+ IRBuilder<> Builder(LD);
+ LLVMContext &Ctx = Builder.getContext();
+ Type *Ty = LD->getType();
+ EVT VT = EVT::getEVT(Ty);
+ EVT HalfVT = VT.getHalfNumVectorElementsVT(Ctx);
+ Type *HalfTy = HalfVT.getTypeForEVT(Ctx);
+
+ Value *Ptr = LD->getPointerOperand();
+ PointerType *HalfPtrTy = HalfTy->getPointerTo(LD->getPointerAddressSpace());
+ Value *HalfPtr = Builder.CreateBitCast(Ptr, HalfPtrTy);
+ // The HW requires the AMX tile to be 64-byte aligned, but the front end
+ // generates code with the vector alignment, which is the vector size.
+ uint64_t HalfTySize = HalfTy->getPrimitiveSizeInBits().getFixedSize() / 8;
+ Align Alignment = std::min(LD->getAlign(), Align(HalfTySize));
+ auto *Lo =
+ Builder.CreateAlignedLoad(HalfTy, HalfPtr, Alignment, LD->isVolatile());
+
+ HalfPtr = Builder.CreateGEP(HalfTy, HalfPtr, Builder.getInt32(1));
+ auto *Hi =
+ Builder.CreateAlignedLoad(HalfTy, HalfPtr, Alignment, LD->isVolatile());
+
+ LoadMap[Inst] = std::make_pair(Lo, Hi);
+}
+
+bool X86LowerAMXType::visitLD() {
+ if (LDSet.empty())
+ return false;
+ for (auto &Inst : LDSet) {
+ int Count = 0;
+ Value *NewInst = nullptr;
+ // The users should be either all AMX intrinsics or all regular LLVM
+ // instructions; mixing both kinds of users is not supported.
+ for (auto I = Inst->use_begin(), E = Inst->use_end(); I != E;) {
+ Use &U = *I++;
+ const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U.getUser());
+ if (!II) {
+ Count++;
+ continue;
+ }
+ if (NewInst)
+ continue;
+ Value *Row, *Col;
+ switch (II->getIntrinsicID()) {
+ default:
+ report_fatal_error("Non-AMX intrinsic use tile type.");
+ break;
+ case Intrinsic::x86_tdpbssd_internal: {
+ unsigned OpNo = U.getOperandNo();
+ switch (OpNo) {
+ case 3:
+ Row = II->getArgOperand(0);
+ Col = II->getArgOperand(1);
+ break;
+ case 4:
+ Row = II->getArgOperand(0);
+ Col = II->getArgOperand(2);
+ break;
+ case 5:
+ Row = II->getArgOperand(2);
+ Col = II->getArgOperand(1);
+ break;
+ }
+ break;
+ }
+ case Intrinsic::x86_tilestored64_internal: {
+ Row = II->getArgOperand(0);
+ Col = II->getArgOperand(1);
+ break;
+ }
+ }
+ assert(Count == 0 && "Can NOT mix amx intrinsic and LLVM instruction");
+ // FIXME: The shape def should be ahead of load.
+ IRBuilder<> Builder(Inst);
+ LLVMContext &Ctx = Builder.getContext();
+ // Use the maximum column as the stride.
+ Value *Stride = Builder.getInt64(64);
+ Value *I8Ptr =
+ Builder.CreateBitCast(Inst->getOperand(0), Type::getInt8PtrTy(Ctx));
+ std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
+
+ NewInst = Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal,
+ None, Args);
+
+ Inst->replaceAllUsesWith(NewInst);
+ }
+ if (!NewInst)
+ splitLD(Inst);
+ }
+ return true;
+}
+
+bool X86LowerAMXType::visit() {
+ bool C;
+ auto IsAMXType = [](FixedVectorType *VTy) {
+ if (!VTy)
+ return false;
+ if (!VTy->getScalarType()->isIntegerTy(32))
+ return false;
+ if (VTy->getNumElements() != 256)
+ return false;
+
+ return true;
+ };
+
+ for (BasicBlock &BB : Func) {
+ for (Instruction &Inst : BB) {
+ LoadInst *LD = dyn_cast<LoadInst>(&Inst);
+ // Check load instruction.
+ // %3 = load <256 x i32>, <256 x i32>* %1, align 64
+ if (LD) {
+ FixedVectorType *VTy = dyn_cast<FixedVectorType>(Inst.getType());
+ if (!IsAMXType(VTy))
+ continue;
+ LDSet.insert(&Inst);
+ continue;
+ }
+ // Check store instruction.
+ // store <256 x i32> %3, <256 x i32>* %2, align 64
+ StoreInst *ST = dyn_cast<StoreInst>(&Inst);
+ if (!ST)
+ continue;
+ FixedVectorType *VTy =
+ dyn_cast<FixedVectorType>(ST->getOperand(0)->getType());
+ if (!IsAMXType(VTy))
+ continue;
+ STSet.insert(&Inst);
+ }
+ }
+
+ C = visitLD() | visitST();
+ for (auto *Inst : STSet)
+ Inst->eraseFromParent();
+ for (auto *Inst : LDSet)
+ Inst->eraseFromParent();
+ return C;
+}
+} // anonymous namespace
+
+namespace {
+
+class X86LowerAMXTypeLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ X86LowerAMXTypeLegacyPass() : FunctionPass(ID) {
+ initializeX86LowerAMXTypeLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ X86LowerAMXType LAT(F);
+ bool C = LAT.visit();
+ return C;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+};
+
+} // anonymous namespace
+
+static const char PassName[] = "Lower AMX type for load/store";
+char X86LowerAMXTypeLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
+ false)
+INITIALIZE_PASS_END(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
+ false)
+
+FunctionPass *llvm::createX86LowerAMXTypePass() {
+ return new X86LowerAMXTypeLegacyPass();
+}
diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
new file mode 100644
index 000000000000..50719744f238
--- /dev/null
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -0,0 +1,263 @@
+//===-- X86PreTileConfig.cpp - Tile Register Configure---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to pre-configure the shape of AMX registers
+/// AMX registers need to be configured before use. The shape of an AMX
+/// register is encoded in the 1st and 2nd machine operands of the AMX pseudo
+/// instructions. The pldtilecfg instruction configures the tile registers and
+/// should dominate all AMX instructions. It produces a virtual cfg register
+/// that is used by all AMX instructions.
+/// This pass finds the common dominator of all AMX instructions and inserts
+/// the pldtilecfg instruction there. In addition, the cfg register that
+/// pldtilecfg produces is appended as the last operand of each AMX
+/// instruction. This scheme models the def-use relationship between the AMX
+/// config instruction and the other AMX instructions. Below is an example.
+///
+/// ----B1----
+/// / \
+/// / \
+/// B2 B3
+/// %1:tile = PTILELOADDV %2:tile = PTILELOADDV
+///
+/// is transformed to
+///
+/// B1
+/// %25:tilecfg = PLDTILECFG
+/// / \
+/// / \
+/// %1:tile = PTILELOADDV %25 %2:tile = PTILELOADDV %25
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TileShapeInfo.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tile-pre-config"
+
+namespace {
+
+class X86PreTileConfig : public MachineFunctionPass {
+ // context
+ MachineFunction *MF = nullptr;
+ const X86Subtarget *ST = nullptr;
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+ MachineDominatorTree *DomTree = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+
+ MachineInstr *getTileConfigPoint();
+
+public:
+ X86PreTileConfig() : MachineFunctionPass(ID) {}
+
+ /// Return the pass name.
+ StringRef getPassName() const override {
+ return "Tile Register Pre-configure";
+ }
+
+ /// X86PreTileConfig analysis usage.
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Perform register allocation.
+ bool runOnMachineFunction(MachineFunction &mf) override;
+
+ static char ID;
+};
+
+} // end anonymous namespace
+
+char X86PreTileConfig::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig",
+ "Tile Register Configure", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
+ "Tile Register Configure", false, false)
+
+void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
+ const TargetInstrInfo *TII,
+ MachineRegisterInfo *MRI,
+ const X86Subtarget *ST) {
+ auto *MBB = MI->getParent();
+
+ // FIXME: AMX should assume AVX512 enabled.
+ if (ST->hasAVX512()) {
+ // Zero stack slot.
+ Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
+ BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm)
+ .addReg(Zmm, RegState::Undef)
+ .addReg(Zmm, RegState::Undef);
+ addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)),
+ FrameIdx)
+ .addReg(Zmm);
+ }
+
+ // Build pseudo ldtilecfg.
+ Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass);
+
+ addFrameReference(
+ BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx);
+
+ return VReg;
+}
+
+static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Unexpected machine instruction on tile");
+ case X86::PTILELOADDV:
+ case X86::PTDPBSSDV:
+ MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1));
+ MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2));
+ ShapeT Shape(&MO1, &MO2, MRI);
+ return Shape;
+ }
+}
+
+MachineInstr *X86PreTileConfig::getTileConfigPoint() {
+ DenseMap<Register, ShapeT> PhysShapeInfo;
+ MachineBasicBlock *MBB = nullptr;
+ DenseSet<const MachineInstr *> MIs;
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ Register VirtReg = Register::index2VirtReg(i);
+ if (MRI->reg_nodbg_empty(VirtReg))
+ continue;
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ if (RC.getID() != X86::TILERegClassID)
+ continue;
+
+ // Find the common dominator for all MI that define tile register.
+ for (const MachineOperand &MO : MRI->def_operands(VirtReg)) {
+ if (MO.isUndef())
+ continue;
+ const auto *MI = MO.getParent();
+ // Skip PHI or IMPLICIT_DEF instructions.
+ // There must be an input tile before a PHI instruction.
+ if (MI->isTransient())
+ continue;
+ if (!MBB)
+ MBB = const_cast<MachineBasicBlock *>(MI->getParent());
+ MBB = DomTree->findNearestCommonDominator(
+ MBB, const_cast<MachineBasicBlock *>(MI->getParent()));
+
+ // Collect the instructions that define shape.
+ ShapeT Shape = getShape(*MI, MRI);
+ std::array<MachineOperand *, 2> ShapeMOs = {Shape.getRow(),
+ Shape.getCol()};
+ for (auto *ShapeMO : ShapeMOs) {
+ Register ShapeReg = ShapeMO->getReg();
+ for (const MachineOperand &MO : MRI->def_operands(ShapeReg)) {
+ const auto *ShapeMI = MO.getParent();
+ MIs.insert(ShapeMI);
+ }
+ }
+ }
+ }
+ if (!MBB)
+ return nullptr;
+ // This pass runs before the PHI elimination pass, so the function
+ // is still in SSA form.
+ assert(MRI->isSSA() && "Not SSA form in pre-tile config");
+ // Shape def should dominate tile config MBB.
+ // def s s1 s2
+ // / \ \ /
+ // / \ \ /
+ // conf s3=phi(s1,s2)
+ // |
+ // c
+ //
+ for (const auto *MI : MIs) {
+ const MachineBasicBlock *ShapeMBB = MI->getParent();
+ if (DomTree->dominates(ShapeMBB, MBB))
+ continue;
+ if (MI->isMoveImmediate())
+ continue;
+ report_fatal_error(MF->getName() + ": Failed to config tile register, "
+ "please define the shape earlier");
+ }
+
+ // ldtilecfg should be inserted after the MIs that define the shape.
+ MachineBasicBlock::reverse_instr_iterator I, E;
+ for (I = MBB->instr_rbegin(), E = MBB->instr_rend(); I != E; ++I) {
+ auto *MI = &*I;
+ if (MIs.count(MI) && (!MI->isMoveImmediate()))
+ break;
+ }
+ MachineBasicBlock::iterator MII;
+ if (I == E)
+ MII = MBB->getFirstNonPHI();
+ else {
+ MII = MachineBasicBlock::iterator(&*I);
+ MII++;
+ }
+ return &*MII;
+}
+
+static void addTileCFGUse(MachineFunction &MF, Register CFG) {
+ for (MachineBasicBlock &MBB : MF) {
+
+ // Traverse the basic block.
+ for (MachineInstr &MI : MBB) {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ default:
+ break;
+ case X86::PTILELOADDV:
+ case X86::PTILESTOREDV:
+ case X86::PTDPBSSDV:
+ unsigned NumOperands = MI.getNumOperands();
+ MI.RemoveOperand(NumOperands - 1);
+ MI.addOperand(MF, MachineOperand::CreateReg(CFG, false));
+ break;
+ }
+ }
+ }
+}
+
+bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ MRI = &mf.getRegInfo();
+ ST = &mf.getSubtarget<X86Subtarget>();
+ TRI = ST->getRegisterInfo();
+ TII = mf.getSubtarget().getInstrInfo();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+
+ MachineInstr *MI = getTileConfigPoint();
+ if (!MI)
+ return false;
+ unsigned Size = ST->getTileConfigSize();
+ Align Alignment = ST->getTileConfigAlignment();
+ int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
+ Register CFG = buildConfigMI(MI, SS, TII, MRI, ST);
+ addTileCFGUse(mf, CFG);
+ return true;
+}
+
+FunctionPass *llvm::createX86PreTileConfigPass() {
+ return new X86PreTileConfig();
+}
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 94c22a535889..81571decae2d 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -19,6 +19,7 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -856,3 +857,78 @@ X86RegisterInfo::getPtrSizedStackRegister(const MachineFunction &MF) const {
StackReg = getX86SubSuperRegister(StackReg, 32);
return StackReg;
}
+
+static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
+ const MachineRegisterInfo *MRI) {
+ if (VRM->hasShape(VirtReg))
+ return VRM->getShape(VirtReg);
+
+ const MachineOperand &Def = *MRI->def_begin(VirtReg);
+ MachineInstr *MI = const_cast<MachineInstr *>(Def.getParent());
+ unsigned OpCode = MI->getOpcode();
+ switch (OpCode) {
+ default:
+ llvm_unreachable("Unexpected machine instruction on tile register!");
+ break;
+ // We only collect the tile shape that is defined.
+ case X86::PTILELOADDV:
+ case X86::PTDPBSSDV:
+ MachineOperand &MO1 = MI->getOperand(1);
+ MachineOperand &MO2 = MI->getOperand(2);
+ ShapeT Shape(&MO1, &MO2, MRI);
+ VRM->assignVirt2Shape(VirtReg, Shape);
+ return Shape;
+ }
+}
+
+bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
+ ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF,
+ const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const {
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
+ VirtReg, Order, Hints, MF, VRM, Matrix);
+
+ if (RC.getID() != X86::TILERegClassID)
+ return BaseImplRetVal;
+
+ ShapeT VirtShape = getTileShape(VirtReg, const_cast<VirtRegMap *>(VRM), MRI);
+ auto AddHint = [&](MCPhysReg PhysReg) {
+ Register VReg = Matrix->getOneVReg(PhysReg);
+ if (VReg == MCRegister::NoRegister) { // Not allocated yet
+ Hints.push_back(PhysReg);
+ return;
+ }
+ ShapeT PhysShape = getTileShape(VReg, const_cast<VirtRegMap *>(VRM), MRI);
+ if (PhysShape == VirtShape)
+ Hints.push_back(PhysReg);
+ };
+
+ SmallSet<MCPhysReg, 4> CopyHints;
+ CopyHints.insert(Hints.begin(), Hints.end());
+ Hints.clear();
+ for (auto Hint : CopyHints) {
+ if (RC.contains(Hint) && !MRI->isReserved(Hint))
+ AddHint(Hint);
+ }
+ for (MCPhysReg PhysReg : Order) {
+ if (!CopyHints.count(PhysReg) && RC.contains(PhysReg) &&
+ !MRI->isReserved(PhysReg))
+ AddHint(PhysReg);
+ }
+
+#define DEBUG_TYPE "tile-hint"
+ LLVM_DEBUG({
+ dbgs() << "Hints for virtual register " << format_hex(VirtReg, 8) << "\n";
+ for (auto Hint : Hints) {
+ dbgs() << "tmm" << Hint << ",";
+ }
+ dbgs() << "\n";
+ });
+#undef DEBUG_TYPE
+
+ return true;
+}
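
Note on the hinting logic above: since one ldtilecfg assigns exactly one shape to each physical tile register, two differently shaped values cannot share a TMM register under the same configuration, so the allocator is steered towards physical registers that are either still free or already hold the same shape. A minimal standalone C++ sketch of that policy follows; TileShape, TmmAssignment and collectTileHints are names made up for the illustration and are not part of the patch.

#include <array>
#include <cstdint>
#include <optional>
#include <vector>

// Illustrative stand-ins for ShapeT and the tmm0-tmm7 register file.
struct TileShape {
  uint16_t Rows, Cols;
  bool operator==(const TileShape &O) const {
    return Rows == O.Rows && Cols == O.Cols;
  }
};

// Assigned[I] is the shape already living in tmmI, if any.
using TmmAssignment = std::array<std::optional<TileShape>, 8>;

// Mirrors the AddHint lambda: a physical tile register is a good hint if it
// is still free or if its current occupant has exactly the same shape.
std::vector<unsigned> collectTileHints(const TileShape &Virt,
                                       const TmmAssignment &Assigned) {
  std::vector<unsigned> Hints;
  for (unsigned I = 0; I < 8; ++I)
    if (!Assigned[I] || *Assigned[I] == Virt)
      Hints.push_back(I); // hint "tmmI"
  return Hints;
}

Preferring same-shape candidates keeps the shape map of the single configuration consistent across all virtual tile registers assigned to one physical register.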
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h
index d23cf7f48ad9..7fd10ddd1a15 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.h
+++ b/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -144,6 +144,11 @@ class X86RegisterInfo final : public X86GenRegisterInfo {
Register getFramePtr() const { return FramePtr; }
// FIXME: Move to FrameInfok
unsigned getSlotSize() const { return SlotSize; }
+
+ bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF, const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const override;
};
} // End llvm namespace
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index 8de5b94bbffa..3c8c34985c6a 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -265,6 +265,9 @@ let SubRegIndices = [sub_ymm] in {
}
}
+// Tile config registers.
+def TMMCFG: X86Reg<"tmmcfg", 0>;
+
// Tile "registers".
def TMM0: X86Reg<"tmm0", 0>;
def TMM1: X86Reg<"tmm1", 1>;
@@ -633,6 +636,11 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
// Tiles
-let isAllocatable = 0 in
-def TILE : RegisterClass<"X86", [untyped], 0,
+let CopyCost = -1 in // Don't allow copying of tile registers
+def TILE : RegisterClass<"X86", [v256i32], 8192,
(sequence "TMM%u", 0, 7)> {let Size = 8192;}
+def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> {
+ let CopyCost = -1; // Don't allow copying of tile config registers.
+ let isAllocatable = 1;
+ let Size = 512;
+}
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 0b2362d6c10d..fa2622333d60 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -472,6 +472,8 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// entry to the function and which must be maintained by every function.
Align stackAlignment = Align(4);
+ Align TileConfigAlignment = Align(4);
+
/// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
///
// FIXME: this is a known good value for Yonah. How about others?
@@ -555,6 +557,9 @@ class X86Subtarget final : public X86GenSubtargetInfo {
return &getInstrInfo()->getRegisterInfo();
}
+ unsigned getTileConfigSize() const { return 64; }
+ Align getTileConfigAlignment() const { return TileConfigAlignment; }
+
/// Returns the minimum alignment known to hold of the
/// stack frame on entry to the function and which must be maintained by every
/// function for this subtarget.
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 288841c4916c..c8f76c210a3f 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -62,6 +62,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
RegisterTargetMachine<X86TargetMachine> Y(getTheX86_64Target());
PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializeX86LowerAMXTypeLegacyPassPass(PR);
initializeGlobalISel(PR);
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
@@ -71,6 +72,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
initializeX86FixupSetCCPassPass(PR);
initializeX86CallFrameOptimizationPass(PR);
initializeX86CmovConverterPassPass(PR);
+ initializeX86TileConfigPass(PR);
initializeX86ExpandPseudoPass(PR);
initializeX86ExecutionDomainFixPass(PR);
initializeX86DomainReassignmentPass(PR);
@@ -379,6 +381,7 @@ class X86PassConfig : public TargetPassConfig {
void addPreEmitPass() override;
void addPreEmitPass2() override;
void addPreSched2() override;
+ bool addPreRewrite() override;
std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};
@@ -407,6 +410,7 @@ TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
void X86PassConfig::addIRPasses() {
addPass(createAtomicExpandPass());
+ addPass(createX86LowerAMXTypePass());
TargetPassConfig::addIRPasses();
@@ -492,7 +496,12 @@ void X86PassConfig::addPreRegAlloc() {
addPass(createX86SpeculativeLoadHardeningPass());
addPass(createX86FlagsCopyLoweringPass());
addPass(createX86WinAllocaExpander());
+
+ if (getOptLevel() != CodeGenOpt::None) {
+ addPass(createX86PreTileConfigPass());
+ }
}
+
void X86PassConfig::addMachineSSAOptimization() {
addPass(createX86DomainReassignmentPass());
TargetPassConfig::addMachineSSAOptimization();
@@ -565,6 +574,11 @@ void X86PassConfig::addPreEmitPass2() {
addPass(createX86LoadValueInjectionRetHardeningPass());
}
+bool X86PassConfig::addPreRewrite() {
+ addPass(createX86TileConfigPass());
+ return true;
+}
+
std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const {
return getStandardCSEConfigForOpt(TM->getOptLevel());
}
diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp
new file mode 100644
index 000000000000..ef010bcd38b7
--- /dev/null
+++ b/llvm/lib/Target/X86/X86TileConfig.cpp
@@ -0,0 +1,248 @@
+//===-- X86TileConfig.cpp - Tile Register Configure----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to configure the shapes of AMX physical registers.
+/// AMX registers need to be configured before use. The X86PreTileConfig pass
+/// inserts the pldtilecfg instruction, but at that point the shape of each
+/// physical tile register is still unknown because register allocation has
+/// not run yet. This pass runs after register allocation. It collects the
+/// shape information of each physical tile register and stores the shapes
+/// into the stack slot that is allocated for loading the configuration into
+/// the tile config register.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TileShapeInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tile-config"
+
+namespace {
+
+class X86TileConfig : public MachineFunctionPass {
+ // context
+ MachineFunction *MF = nullptr;
+ const X86Subtarget *ST = nullptr;
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+ MachineDominatorTree *DomTree = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ VirtRegMap *VRM = nullptr;
+ LiveIntervals *LIS = nullptr;
+
+ MachineInstr *getTileConfigPoint();
+ void tileConfig();
+
+public:
+ X86TileConfig() : MachineFunctionPass(ID) {}
+
+ /// Return the pass name.
+ StringRef getPassName() const override { return "Tile Register Configure"; }
+
+ /// X86TileConfig analysis usage.
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  /// Configure the tile registers after register allocation.
+ bool runOnMachineFunction(MachineFunction &mf) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoPHIs);
+ }
+
+ static char ID;
+};
+
+} // end anonymous namespace
+
+char X86TileConfig::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86TileConfig, "tileconfig", "Tile Register Configure",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(X86TileConfig, "tileconfig", "Tile Register Configure",
+ false, false)
+
+void X86TileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<VirtRegMap>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+static unsigned getTilePhysRegIndex(Register PhysReg) {
+  assert((PhysReg >= X86::TMM0 && PhysReg <= X86::TMM7) &&
+ "Tile register number is invalid");
+ return (PhysReg - X86::TMM0);
+}
+
+static MachineInstr *
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ Register SrcReg, unsigned BitSize, int FrameIdx, int Offset,
+ const TargetInstrInfo *TII, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) {
+
+ unsigned SubIdx = (BitSize == 8) ? X86::sub_8bit : X86::sub_16bit;
+ unsigned Opc = (BitSize == 8) ? X86::MOV8mr : X86::MOV16mr;
+ if (BitSize == TRI->getRegSizeInBits(*RC))
+ SubIdx = 0;
+ MachineInstr *NewMI =
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)), FrameIdx,
+ Offset)
+ .addReg(SrcReg, 0, SubIdx);
+ return NewMI;
+}
+
+static MachineInstr *storeImmToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ int64_t Imm, unsigned BitSize,
+ int FrameIdx, int Offset,
+ const TargetInstrInfo *TII) {
+ unsigned Opc = (BitSize == 8) ? X86::MOV8mi : X86::MOV16mi;
+ return addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)),
+ FrameIdx, Offset)
+ .addImm(Imm);
+}
+
+MachineInstr *X86TileConfig::getTileConfigPoint() {
+ for (MachineBasicBlock &MBB : *MF) {
+
+ // Traverse the basic block.
+ for (MachineInstr &MI : MBB)
+      // See X86PreTileConfig.cpp.
+ // We only support one tile config for now.
+ if (MI.getOpcode() == X86::PLDTILECFG)
+ return &MI;
+ }
+
+ return nullptr;
+}
+
+void X86TileConfig::tileConfig() {
+ MachineInstr *MI = getTileConfigPoint();
+ if (!MI)
+ return;
+ MachineBasicBlock *MBB = MI->getParent();
+ int SS = MI->getOperand(1).getIndex();
+ BitVector PhysRegs(TRI->getNumRegs());
+
+ // Fill in the palette first.
+ auto *NewMI = storeImmToStackSlot(*MBB, *MI, 1, 8, SS, 0, TII);
+ LIS->InsertMachineInstrInMaps(*NewMI);
+ // Fill in the shape of each tile physical register.
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ Register VirtReg = Register::index2VirtReg(i);
+ if (MRI->reg_nodbg_empty(VirtReg))
+ continue;
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ if (RC.getID() != X86::TILERegClassID)
+ continue;
+ Register PhysReg = VRM->getPhys(VirtReg);
+ if (PhysRegs.test(PhysReg))
+ continue;
+ PhysRegs.set(PhysReg);
+ ShapeT Shape = VRM->getShape(VirtReg);
+ Register RowReg = Shape.getRow()->getReg();
+ Register ColReg = Shape.getCol()->getReg();
+
+ // Here is the data format for the tile config.
+ // 0 palette
+ // 1 start_row
+ // 2-15 reserved, must be zero
+ // 16-17 tile0.colsb Tile 0 bytes per row.
+ // 18-19 tile1.colsb Tile 1 bytes per row.
+ // 20-21 tile2.colsb Tile 2 bytes per row.
+ // ... (sequence continues)
+ // 30-31 tile7.colsb Tile 7 bytes per row.
+ // 32-47 reserved, must be zero
+ // 48 tile0.rows Tile 0 rows.
+ // 49 tile1.rows Tile 1 rows.
+ // 50 tile2.rows Tile 2 rows.
+ // ... (sequence continues)
+ // 55 tile7.rows Tile 7 rows.
+ // 56-63 reserved, must be zero
+ unsigned Index = getTilePhysRegIndex(PhysReg);
+ int RowOffset = 48 + Index;
+ int ColOffset = 16 + Index * 2;
+
+ unsigned BitSize = 8;
+ for (const auto &Pair : {std::make_pair(RowReg, RowOffset),
+ std::make_pair(ColReg, ColOffset)}) {
+ int64_t Imm;
+ int ImmCount = 0;
+      // All defs must have the same value; otherwise the MIs are invalid.
+      // An immediate is preferred.
+ for (const MachineOperand &MO : MRI->def_operands(Pair.first)) {
+ const auto *Inst = MO.getParent();
+ if (Inst->isMoveImmediate()) {
+ ImmCount++;
+ Imm = Inst->getOperand(1).getImm();
+ break;
+ }
+ }
+ auto StoreConfig = [&](int Offset) {
+ MachineInstr *NewMI = nullptr;
+ if (ImmCount)
+ NewMI = storeImmToStackSlot(*MBB, *MI, Imm, BitSize, SS, Offset, TII);
+ else {
+ const TargetRegisterClass *RC = MRI->getRegClass(Pair.first);
+ NewMI = storeRegToStackSlot(*MBB, *MI, Pair.first, BitSize, SS,
+ Offset, TII, RC, TRI);
+ }
+ SlotIndex SIdx = LIS->InsertMachineInstrInMaps(*NewMI);
+ if (!ImmCount) {
+ // Extend the live interval.
+ SmallVector<SlotIndex, 8> EndPoints = {SIdx.getRegSlot()};
+ LiveInterval &Int = LIS->getInterval(Pair.first);
+ LIS->extendToIndices(Int, EndPoints);
+ }
+ };
+ StoreConfig(Pair.second);
+ BitSize += 8;
+ }
+ }
+}
+
+bool X86TileConfig::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ MRI = &mf.getRegInfo();
+ ST = &mf.getSubtarget<X86Subtarget>();
+ TRI = ST->getRegisterInfo();
+ TII = mf.getSubtarget().getInstrInfo();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ VRM = &getAnalysis<VirtRegMap>();
+ LIS = &getAnalysis<LiveIntervals>();
+
+ if (VRM->isShapeMapEmpty())
+ return false;
+
+ tileConfig();
+ return true;
+}
+
+FunctionPass *llvm::createX86TileConfigPass() { return new X86TileConfig(); }
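
To make the byte offsets used by tileConfig() above easier to follow, here is a small self-contained sketch of the layout that ldtilecfg consumes; the helper names are invented for the example, only the offsets (palette at byte 0, tileN.colsb at 16 + 2*N, tileN.rows at 48 + N) come from the comment in the pass.

#include <cassert>
#include <cstdio>

// Offsets inside the 64-byte tile configuration block, as documented in
// X86TileConfig::tileConfig().
constexpr int colsbOffset(unsigned TmmIndex) { return 16 + 2 * TmmIndex; }
constexpr int rowsOffset(unsigned TmmIndex) { return 48 + TmmIndex; }

int main() {
  // Example: tmm2 configured as 16 rows x 64 bytes per row.
  unsigned Index = 2;
  assert(Index < 8 && "only tmm0..tmm7 exist");
  std::printf("palette     -> byte 0 (stored as immediate 1)\n");
  std::printf("tmm%u.colsb -> bytes %d-%d (16-bit store)\n", Index,
              colsbOffset(Index), colsbOffset(Index) + 1);
  std::printf("tmm%u.rows  -> byte %d (8-bit store)\n", Index,
              rowsOffset(Index));
  return 0;
}

This matches what the pass emits: one MOV8mi for the palette, then for every allocated tile register either a MOV8mi/MOV16mi (when the row/column def is a move-immediate) or a MOV8mr/MOV16mr from the shape registers, all into the same stack slot that the pldtilecfg instruction reads.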
diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
new file mode 100644
index 000000000000..5ef0d02ebfc5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.__tile_str = type <{ i16, i16, [60 x i8], <256 x i32> }>
+
+@buf = dso_local global [3072 x i8] zeroinitializer, align 16
+
+define dso_local void @test_api(i16 signext %0, i16 signext %1) local_unnamed_addr #2 {
+; CHECK-LABEL: test_api:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: subq $4056, %rsp # imm = 0xFD8
+; CHECK-NEXT: .cfi_def_cfa_offset 4096
+; CHECK-NEXT: .cfi_offset %rbx, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
+; CHECK-NEXT: movl $buf, %eax
+; CHECK-NEXT: movl $32, %r14d
+; CHECK-NEXT: movw $8, %r15w
+; CHECK-NEXT: tileloadd (%rax,%r14), %tmm1
+; CHECK-NEXT: movabsq $64, %rax
+; CHECK-NEXT: tilestored %tmm1, 2048(%rsp,%rax) # 1024-byte Folded Spill
+; CHECK-NEXT: movl $buf+1024, %eax
+; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2
+; CHECK-NEXT: movabsq $64, %rax
+; CHECK-NEXT: tilestored %tmm2, 1024(%rsp,%rax) # 1024-byte Folded Spill
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: movl $buf+2048, %eax
+; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
+; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0
+; CHECK-NEXT: movabsq $64, %rcx
+; CHECK-NEXT: tileloadd 2048(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload
+; CHECK-NEXT: movabsq $64, %rcx
+; CHECK-NEXT: tileloadd 1024(%rsp,%rcx), %tmm2 # 1024-byte Folded Reload
+; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
+; CHECK-NEXT: tilestored %tmm0, (%rax,%r14)
+; CHECK-NEXT: addq $4056, %rsp # imm = 0xFD8
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: tilerelease
+; CHECK-NEXT: retq
+ %3 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32) #4
+ %4 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32) #4
+ tail call void (...) @foo() #4
+ %5 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32) #4
+ %6 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, <256 x i32> %5, <256 x i32> %3, <256 x i32> %4) #4
+ tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, <256 x i32> %6) #4
+ ret void
+}
+
+declare dso_local void @foo(...) local_unnamed_addr #3
+
+declare <256 x i32> @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #4
+declare <256 x i32> @llvm.x86.tdpbssd.internal(i16, i16, i16, <256 x i32>, <256 x i32>, <256 x i32>) #4
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, <256 x i32>) #4
+
+attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="8192" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/AMX/amx-config.ll b/llvm/test/CodeGen/X86/AMX/amx-config.ll
new file mode 100644
index 000000000000..d730f2bc851e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-config.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -verify-machineinstrs | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@buf = dso_local global [1024 x i8] zeroinitializer, align 16
+@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define dso_local void @test_api(i32 %0, i16 signext %1, i16 signext %2) local_unnamed_addr #2 {
+; CHECK-LABEL: test_api:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movsbl %sil, %eax
+; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB0_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: movl $buf, %ecx
+; CHECK-NEXT: jmp .LBB0_3
+; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: movl $buf2, %ecx
+; CHECK-NEXT: .LBB0_3:
+; CHECK-NEXT: movl $32, %edi
+; CHECK-NEXT: tileloadd (%rcx,%rdi), %tmm0
+; CHECK-NEXT: tileloadd (%rcx,%rdi), %tmm2
+; CHECK-NEXT: tileloadd (%rcx,%rdi), %tmm1
+; CHECK-NEXT: tdpbssd %tmm2, %tmm0, %tmm1
+; CHECK-NEXT: movl $buf, %ecx
+; CHECK-NEXT: movl $32, %esi
+; CHECK-NEXT: tilestored %tmm1, (%rcx,%rsi)
+; CHECK-NEXT: tilerelease
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %4 = icmp eq i32 %0, 0
+ %5 = shl i16 %1, 8
+ %6 = ashr exact i16 %5, 8
+ br i1 %4, label %11, label %7
+
+7: ; preds = %3
+ %8 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %6, i16 %1, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
+ %9 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
+ %10 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
+ br label %15
+
+11: ; preds = %3
+ %12 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %6, i16 %1, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) #3
+ %13 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) #3
+ %14 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) #3
+ br label %15
+
+15: ; preds = %11, %7
+ %16 = phi <256 x i32> [ %12, %11 ], [ %8, %7 ]
+ %17 = phi <256 x i32> [ %13, %11 ], [ %9, %7 ]
+ %18 = phi <256 x i32> [ %14, %11 ], [ %10, %7 ]
+ %19 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %6, i16 %2, i16 %1, <256 x i32> %18, <256 x i32> %16, <256 x i32> %17) #3
+ tail call void @llvm.x86.tilestored64.internal(i16 %6, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, <256 x i32> %19) #3
+ ret void
+}
+
+declare <256 x i32> @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #3
+
+declare <256 x i32> @llvm.x86.tdpbssd.internal(i16, i16, i16, <256 x i32>, <256 x i32>, <256 x i32>) #3
+
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, <256 x i32>) #3
+
+attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="8192" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+avx,+avx2,+avx512f,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill.ll b/llvm/test/CodeGen/X86/AMX/amx-spill.ll
new file mode 100644
index 000000000000..6aee3e6fab58
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-spill.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@buf = dso_local global [1024 x i8] zeroinitializer, align 16
+@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
+
+define dso_local void @test_api(i32 %0, i16 signext %1, i16 signext %2) local_unnamed_addr #2 {
+; CHECK-LABEL: test_api:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $2936, %rsp # imm = 0xB78
+; CHECK-NEXT: .cfi_def_cfa_offset 2944
+; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $buf, %r8d
+; CHECK-NEXT: movl $32, %eax
+; CHECK-NEXT: tileloadd (%r8,%rax), %tmm1
+; CHECK-NEXT: tileloadd (%r8,%rax), %tmm1
+; CHECK-NEXT: movabsq $64, %rcx
+; CHECK-NEXT: tilestored %tmm1, 896(%rsp,%rcx) # 1024-byte Folded Spill
+; CHECK-NEXT: tileloadd (%r8,%rax), %tmm3
+; CHECK-NEXT: tileloadd (%r8,%rax), %tmm4
+; CHECK-NEXT: tileloadd (%r8,%rax), %tmm2
+; CHECK-NEXT: tileloadd (%r8,%rax), %tmm5
+; CHECK-NEXT: tileloadd (%r8,%rax), %tmm0
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB0_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: tileloadd (%r8,%rax), %tmm6
+; CHECK-NEXT: tileloadd (%r8,%rax), %tmm7
+; CHECK-NEXT: tileloadd (%r8,%rax), %tmm1
+; CHECK-NEXT: jmp .LBB0_3
+; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: movl $buf2, %ecx
+; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm6
+; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm7
+; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm1
+; CHECK-NEXT: .LBB0_3:
+; CHECK-NEXT: tdpbssd %tmm7, %tmm6, %tmm1
+; CHECK-NEXT: movabsq $64, %rax
+; CHECK-NEXT: tileloadd 896(%rsp,%rax), %tmm7 # 1024-byte Folded Reload
+; CHECK-NEXT: tdpbssd %tmm7, %tmm1, %tmm3
+; CHECK-NEXT: tdpbssd %tmm4, %tmm3, %tmm2
+; CHECK-NEXT: tdpbssd %tmm5, %tmm2, %tmm0
+; CHECK-NEXT: movl $buf, %eax
+; CHECK-NEXT: movl $32, %ecx
+; CHECK-NEXT: tilestored %tmm0, (%rax,%rcx)
+; CHECK-NEXT: addq $2936, %rsp # imm = 0xB78
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: tilerelease
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %4 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
+ %5 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
+ %6 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
+ %7 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
+ %8 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
+ %9 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
+ %10 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
+ %11 = icmp eq i32 %0, 0
+ br i1 %11, label %16, label %12
+
+12: ; preds = %3
+ %13 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %1, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
+ %14 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
+ %15 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32) #3
+ br label %20
+
+16: ; preds = %3
+ %17 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %1, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) #3
+ %18 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) #3
+ %19 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32) #3
+ br label %20
+
+20: ; preds = %16, %12
+ %21 = phi <256 x i32> [ %17, %16 ], [ %13, %12 ]
+ %22 = phi <256 x i32> [ %18, %16 ], [ %14, %12 ]
+ %23 = phi <256 x i32> [ %19, %16 ], [ %15, %12 ]
+ %24 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %1, i16 %2, i16 %1, <256 x i32> %23, <256 x i32> %21, <256 x i32> %22) #3
+ %25 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %1, i16 %2, i16 %2, <256 x i32> %6, <256 x i32> %24, <256 x i32> %5) #3
+ %26 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %1, i16 %2, i16 %2, <256 x i32> %8, <256 x i32> %25, <256 x i32> %7) #3
+ %27 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %2, i16 %2, i16 %2, <256 x i32> %10, <256 x i32> %26, <256 x i32> %9) #3
+ tail call void @llvm.x86.tilestored64.internal(i16 %2, i16 %2, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, <256 x i32> %27) #3
+ ret void
+}
+
+declare <256 x i32> @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #3
+declare <256 x i32> @llvm.x86.tdpbssd.internal(i16, i16, i16, <256 x i32>, <256 x i32>, <256 x i32>) #3
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, <256 x i32>) #3
+
+attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="8192" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/AMX/amx-type.ll b/llvm/test/CodeGen/X86/AMX/amx-type.ll
new file mode 100644
index 000000000000..2caae116d722
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-type.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -lower-amx-type %s -S | FileCheck %s
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.__tile_str = type { i16, i16, <256 x i32> }
+
+@buf = dso_local global [1024 x i8] zeroinitializer, align 16
+@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
+
+define dso_local void @test_load(i8* %in, i8* %out) local_unnamed_addr #2 {
+; CHECK-LABEL: @test_load(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[IN:%.*]] to <256 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[OUT:%.*]] to <256 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i32>* [[TMP1]] to <128 x i32>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <128 x i32>, <128 x i32>* [[TMP3]], align 64
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr <128 x i32>, <128 x i32>* [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <128 x i32>, <128 x i32>* [[TMP5]], align 64
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <256 x i32>* [[TMP2]] to <128 x i32>*
+; CHECK-NEXT: store <128 x i32> [[TMP4]], <128 x i32>* [[TMP7]], align 64
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <128 x i32>, <128 x i32>* [[TMP7]], i32 1
+; CHECK-NEXT: store <128 x i32> [[TMP6]], <128 x i32>* [[TMP8]], align 64
+; CHECK-NEXT: ret void
+;
+ %1 = bitcast i8* %in to <256 x i32>*
+ %2 = bitcast i8* %out to <256 x i32>*
+ %3 = load <256 x i32>, <256 x i32>* %1, align 64, !tbaa !8
+ store <256 x i32> %3, <256 x i32>* %2, align 64, !tbaa !8
+ ret void
+}
+
+define dso_local void @__tile_loadd(%struct.__tile_str* nocapture %0, i8* %1, i64 %2) local_unnamed_addr #0 {
+; CHECK-LABEL: @__tile_loadd(
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], %struct.__tile_str* [[TMP0:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP4]], align 64, [[TBAA2:!tbaa !.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP0]], i64 0, i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[TMP6]], align 2, [[TBAA7:!tbaa !.*]]
+; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP2:%.*]], 32
+; CHECK-NEXT: [[TMP9:%.*]] = ashr exact i64 [[TMP8]], 32
+; CHECK-NEXT: [[TMP10:%.*]] = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP1:%.*]], i64 [[TMP9]]) [[ATTR3:#.*]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP0]], i64 0, i32 2
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <256 x i32>* [[TMP11]] to i8*
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP12]], i64 64, <256 x i32> [[TMP10]])
+; CHECK-NEXT: ret void
+;
+ %4 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %0, i64 0, i32 0
+ %5 = load i16, i16* %4, align 64, !tbaa !2
+ %6 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %0, i64 0, i32 1
+ %7 = load i16, i16* %6, align 2, !tbaa !7
+ %8 = shl i64 %2, 32
+ %9 = ashr exact i64 %8, 32
+ %10 = tail call <256 x i32> @llvm.x86.tileloadd64.internal(i16 %5, i16 %7, i8* %1, i64 %9) #3
+ %11 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %0, i64 0, i32 2
+ store <256 x i32> %10, <256 x i32>* %11, align 64, !tbaa !8
+ ret void
+}
+
+define dso_local void @__tile_dpbsud(%struct.__tile_str* nocapture %0, %struct.__tile_str* nocapture readonly byval(%struct.__tile_str) align 64 %1, %struct.__tile_str* nocapture readonly byval(%struct.__tile_str) align 64 %2) local_unnamed_addr #0 {
+; CHECK-LABEL: @__tile_dpbsud(
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], %struct.__tile_str* [[TMP1:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP4]], align 64, [[TBAA2]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP2:%.*]], i64 0, i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[TMP6]], align 2, [[TBAA7]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP1]], i64 0, i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP8]], align 2, [[TBAA7]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP0:%.*]], i64 0, i32 2
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <256 x i32>* [[TMP10]] to i8*
+; CHECK-NEXT: [[TMP12:%.*]] = call <256 x i32> @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP11]], i64 64)
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP1]], i64 0, i32 2
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <256 x i32>* [[TMP13]] to i8*
+; CHECK-NEXT: [[TMP15:%.*]] = call <256 x i32> @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[TMP9]], i8* [[TMP14]], i64 64)
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP2]], i64 0, i32 2
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <256 x i32>* [[TMP16]] to i8*
+; CHECK-NEXT: [[TMP18:%.*]] = call <256 x i32> @llvm.x86.tileloadd64.internal(i16 [[TMP9]], i16 [[TMP7]], i8* [[TMP17]], i64 64)
+; CHECK-NEXT: [[TMP19:%.*]] = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 [[TMP5]], i16 [[TMP7]], i16 [[TMP9]], <256 x i32> [[TMP12]], <256 x i32> [[TMP15]], <256 x i32> [[TMP18]]) [[ATTR3]]
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast <256 x i32>* [[TMP10]] to i8*
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP20]], i64 64, <256 x i32> [[TMP19]])
+; CHECK-NEXT: ret void
+;
+ %4 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %1, i64 0, i32 0
+ %5 = load i16, i16* %4, align 64, !tbaa !2
+ %6 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 1
+ %7 = load i16, i16* %6, align 2, !tbaa !7
+ %8 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %1, i64 0, i32 1
+ %9 = load i16, i16* %8, align 2, !tbaa !7
+ %10 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %0, i64 0, i32 2
+ %11 = load <256 x i32>, <256 x i32>* %10, align 64, !tbaa !8
+ %12 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %1, i64 0, i32 2
+ %13 = load <256 x i32>, <256 x i32>* %12, align 64, !tbaa !8
+ %14 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 2
+ %15 = load <256 x i32>, <256 x i32>* %14, align 64, !tbaa !8
+ %16 = tail call <256 x i32> @llvm.x86.tdpbssd.internal(i16 %5, i16 %7, i16 %9, <256 x i32> %11, <256 x i32> %13, <256 x i32> %15) #3
+ store <256 x i32> %16, <256 x i32>* %10, align 64, !tbaa !8
+ ret void
+}
+
+define dso_local void @__tile_stored(i8* %0, i64 %1, %struct.__tile_str* nocapture readonly byval(%struct.__tile_str) align 64 %2) local_unnamed_addr #1 {
+; CHECK-LABEL: @__tile_stored(
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], %struct.__tile_str* [[TMP2:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP4]], align 64, [[TBAA2]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP2]], i64 0, i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[TMP6]], align 2, [[TBAA7]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP2]], i64 0, i32 2
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <256 x i32>* [[TMP8]] to i8*
+; CHECK-NEXT: [[TMP10:%.*]] = call <256 x i32> @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP9]], i64 64)
+; CHECK-NEXT: [[TMP11:%.*]] = shl i64 [[TMP1:%.*]], 32
+; CHECK-NEXT: [[TMP12:%.*]] = ashr exact i64 [[TMP11]], 32
+; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP0:%.*]], i64 [[TMP12]], <256 x i32> [[TMP10]]) [[ATTR3]]
+; CHECK-NEXT: ret void
+;
+ %4 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 0
+ %5 = load i16, i16* %4, align 64, !tbaa !2
+ %6 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 1
+ %7 = load i16, i16* %6, align 2, !tbaa !7
+ %8 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 2
+ %9 = load <256 x i32>, <256 x i32>* %8, align 64, !tbaa !8
+ %10 = shl i64 %1, 32
+ %11 = ashr exact i64 %10, 32
+ tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %7, i8* %0, i64 %11, <256 x i32> %9) #3
+ ret void
+}
+
+declare <256 x i32> @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #3
+declare <256 x i32> @llvm.x86.tdpbssd.internal(i16, i16, i16, <256 x i32>, <256 x i32>, <256 x i32>) #3
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, <256 x i32>) #3
+
+attributes #0 = { alwaysinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="8192" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { alwaysinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { alwaysinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-int8,+amx-tile,+avx,+avx2,+avx512f,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 12.0.0 (ssh://git-amr-1.devtools.intel.com:29418/dpd_icl-llvm_project_worldread f3c78a3f053379a2511e00e9ce2c13383ea3f835)"}
+!2 = !{!3, !4, i64 0}
+!3 = !{!"__tile_str", !4, i64 0, !4, i64 2, !5, i64 1024}
+!4 = !{!"short", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = !{!3, !4, i64 2}
+!8 = !{!5, !5, i64 0}
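
For orientation, the @__tile_loadd, @__tile_dpbsud and @__tile_stored functions checked above are the lowered form of the user-facing tile helpers. A rough C++-level sketch of the corresponding source shape is given below; the struct mirrors %struct.__tile_str ({ i16, i16, <256 x i32> }), and the helper signatures are only inferred from the IR in this test, so treat them as assumptions rather than the authoritative amxintrin.h declarations.

// Sketch only: plausible user-level view of the tile helpers whose lowered
// IR is tested above. Signatures are inferred from the IR and may differ
// from the real amxintrin.h declarations.
typedef int __v256i __attribute__((vector_size(1024))); // <256 x i32>

struct __tile_str {
  unsigned short row; // i16 field 0 of %struct.__tile_str
  unsigned short col; // i16 field 1
  __v256i tile;       // <256 x i32> payload
};

// Assumed helper shapes, matching @__tile_loadd / @__tile_dpbsud /
// @__tile_stored in the test.
void __tile_loadd(__tile_str *dst, const void *base, long stride);
void __tile_dpbsud(__tile_str *dst, __tile_str a, __tile_str b);
void __tile_stored(void *base, long stride, __tile_str src);

// Load two tiles, accumulate their dot-product into c, then store c.
void tile_dot(__tile_str &a, __tile_str &b, __tile_str &c, const void *abuf,
              const void *bbuf, void *cbuf, long stride) {
  __tile_loadd(&a, abuf, stride);
  __tile_loadd(&b, bbuf, stride);
  __tile_dpbsud(&c, a, b);
  __tile_stored(cbuf, stride, c);
}

The lower-amx-type pass then turns the <256 x i32> loads and stores that feed these intrinsics into llvm.x86.tileloadd64.internal / llvm.x86.tilestored64.internal calls (with a 64-byte stride for the in-struct copies), and splits any remaining plain <256 x i32> load/store into two <128 x i32> halves, exactly as the CHECK lines above show.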
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
index 528b3c39c879..40349b869b2c 100644
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -18,6 +18,7 @@
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand Atomic instructions
+; CHECK-NEXT: Lower AMX type for load/store
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Lower Garbage Collection Instructions
; CHECK-NEXT: Shadow Stack GC Lowering
diff --git a/llvm/test/CodeGen/X86/ipra-reg-usage.ll b/llvm/test/CodeGen/X86/ipra-reg-usage.ll
index 8603e3455acf..cc67c195be63 100644
--- a/llvm/test/CodeGen/X86/ipra-reg-usage.ll
+++ b/llvm/test/CodeGen/X86/ipra-reg-usage.ll
@@ -3,7 +3,7 @@
target triple = "x86_64-unknown-unknown"
declare void @bar1()
define preserve_allcc void @foo()#0 {
-; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $fpcw $fpsw $fs $gs $hip $ip $mxcsr $rip $riz $ss $ssp $bnd0 $bnd1 $bnd2 $bnd3 $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $r11b $r11bh $r11d $r11w $r11wh $k0_k1 $k2_k3 $k4_k5 $k6_k7
+; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $fpcw $fpsw $fs $gs $hip $ip $mxcsr $rip $riz $ss $ssp $tmmcfg $bnd0 $bnd1 $bnd2 $bnd3 $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $r11b $r11bh $r11d $r11w $r11wh $k0_k1 $k2_k3 $k4_k5 $k6_k7
call void @bar1()
call void @bar2()
ret void
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index f44a7cdad3c7..b851eea60b0a 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -24,6 +24,7 @@
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand Atomic instructions
+; CHECK-NEXT: Lower AMX type for load/store
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
@@ -118,11 +119,12 @@
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: X86 EFLAGS copy lowering
; CHECK-NEXT: X86 WinAlloca Expander
+; CHECK-NEXT: MachineDominator Tree Construction
+; CHECK-NEXT: Tile Register Pre-configure
; CHECK-NEXT: Detect Dead Lanes
; CHECK-NEXT: Process Implicit Definitions
; CHECK-NEXT: Remove unreachable machine basic blocks
; CHECK-NEXT: Live Variable Analysis
-; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: Machine Natural Loop Construction
; CHECK-NEXT: Eliminate PHI nodes for register allocation
; CHECK-NEXT: Two-Address instruction pass
@@ -141,6 +143,7 @@
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Machine Optimization Remark Emitter
; CHECK-NEXT: Greedy Register Allocator
+; CHECK-NEXT: Tile Register Configure
; CHECK-NEXT: Virtual Register Rewriter
; CHECK-NEXT: Stack Slot Coloring
; CHECK-NEXT: Machine Copy Propagation Pass
diff --git a/llvm/test/CodeGen/X86/statepoint-fixup-invoke.mir b/llvm/test/CodeGen/X86/statepoint-fixup-invoke.mir
index 5f1ca4f80f1f..cbeb1f9279f8 100644
--- a/llvm/test/CodeGen/X86/statepoint-fixup-invoke.mir
+++ b/llvm/test/CodeGen/X86/statepoint-fixup-invoke.mir
@@ -91,7 +91,7 @@ body: |
; CHECK-DAG: MOV64mr %stack.1, 1, $noreg, 0, $noreg, $rdi :: (store 8 into %stack.1)
; CHECK: EH_LABEL <mcsymbol .Ltmp0>
; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
- ; CHECK: STATEPOINT 0, 0, 1, @some_call, $rdi, 2, 0, 2, 0, 2, 5, 2, 0, 2, -1, 2, 0, 2, 0, 2, 0, 2, 2, 1, 8, %stack.0, 0, 1, 8, %stack.1, 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store 8 on %stack.1), (load store 8 on %stack.0)
+ ; CHECK: STATEPOINT 0, 0, 1, @some_call, $rdi, 2, 0, 2, 0, 2, 5, 2, 0, 2, -1, 2, 0, 2, 0, 2, 0, 2, 2, 1, 8, %stack.0, 0, 1, 8, %stack.1, 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store 8 on %stack.0), (load store 8 on %stack.1)
; CHECK-DAG: $r14 = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load 8 from %stack.0)
; CHECK-DAG: $rbx = MOV64rm %stack.1, 1, $noreg, 0, $noreg :: (load 8 from %stack.1)
; CHECK: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
diff --git a/llvm/test/CodeGen/X86/statepoint-fixup-shared-ehpad.mir b/llvm/test/CodeGen/X86/statepoint-fixup-shared-ehpad.mir
index edd3beada157..29e35f05a357 100644
--- a/llvm/test/CodeGen/X86/statepoint-fixup-shared-ehpad.mir
+++ b/llvm/test/CodeGen/X86/statepoint-fixup-shared-ehpad.mir
@@ -108,7 +108,7 @@ body: |
; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK: MOV64mr [[STACK0:%stack.[0-9]+]], 1, $noreg, 0, $noreg, killed $rbx :: (store 8 into [[STACK0]])
; CHECK: MOV64mr [[STACK1:%stack.[0-9]+]], 1, $noreg, 0, $noreg, killed $r14 :: (store 8 into [[STACK1]])
- ; CHECK: STATEPOINT 0, 0, 0, @foo, 2, 0, 2, 0, 2, 0, 2, 2, 1, 8, [[STACK0]], 0, 1, 8, [[STACK1]], 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store 8 on [[STACK0]]), (load store 8 on [[STACK1]])
+ ; CHECK: STATEPOINT 0, 0, 0, @foo, 2, 0, 2, 0, 2, 0, 2, 2, 1, 8, [[STACK0]], 0, 1, 8, [[STACK1]], 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store 8 on [[STACK1]]), (load store 8 on [[STACK0]])
; CHECK-DAG: $rbx = MOV64rm [[STACK0]], 1, $noreg, 0, $noreg :: (load 8 from [[STACK0]])
; CHECK-DAG: $r14 = MOV64rm [[STACK1]], 1, $noreg, 0, $noreg :: (load 8 from [[STACK1]])
; CHECK: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
@@ -121,7 +121,7 @@ body: |
; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-DAG: MOV64mr [[STACK0]], 1, $noreg, 0, $noreg, killed $rbx :: (store 8 into [[STACK0]])
; CHECK-DAG: MOV64mr [[STACK1]], 1, $noreg, 0, $noreg, killed $r14 :: (store 8 into [[STACK1]])
- ; CHECK: STATEPOINT 0, 0, 0, @bar, 2, 0, 2, 0, 2, 0, 2, 2, 1, 8, %stack.0, 0, 1, 8, [[STACK1]], 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store 8 on [[STACK0]]), (load store 8 on [[STACK1]])
+ ; CHECK: STATEPOINT 0, 0, 0, @bar, 2, 0, 2, 0, 2, 0, 2, 2, 1, 8, %stack.0, 0, 1, 8, [[STACK1]], 0, 2, 0, 2, 2, 0, 0, 1, 1, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store 8 on [[STACK1]]), (load store 8 on [[STACK0]])
; CHECK-DAG: $rbx = MOV64rm [[STACK0]], 1, $noreg, 0, $noreg :: (load 8 from [[STACK0]])
; CHECK-DAG: $r14 = MOV64rm [[STACK1]], 1, $noreg, 0, $noreg :: (load 8 from [[STACK1]])
; CHECK: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp