[clang] 939d830 - [X86-64] Support Intel AMX Intrinsic
Xiang1 Zhang via cfe-commits
cfe-commits at lists.llvm.org
Mon Jul 6 19:14:25 PDT 2020
Author: Xiang1 Zhang
Date: 2020-07-07T10:13:40+08:00
New Revision: 939d8309dbd4ee6cf6e9ef3e8ea26df008b006b4
URL: https://github.com/llvm/llvm-project/commit/939d8309dbd4ee6cf6e9ef3e8ea26df008b006b4
DIFF: https://github.com/llvm/llvm-project/commit/939d8309dbd4ee6cf6e9ef3e8ea26df008b006b4.diff
LOG: [X86-64] Support Intel AMX Intrinsic
INTEL ADVANCED MATRIX EXTENSIONS (AMX).
AMX is a new programming paradigm, it has a set of 2-dimensional registers
(TILES) representing sub-arrays from a larger 2-dimensional memory image and
operate on TILES.
These intrinsics use direct TMM register number as its params.
Spec can be found in Chapter 3 here https://software.intel.com/content/www/us/en/develop/download/intel-architecture-instruction-set-extensions-programming-reference.html
Reviewed By: craig.topper
Differential Revision: https://reviews.llvm.org/D83111
Added:
clang/lib/Headers/amxintrin.h
clang/test/CodeGen/AMX/amx.c
clang/test/CodeGen/AMX/amx_errors.c
clang/test/CodeGen/AMX/amx_inline_asm.c
clang/test/Preprocessor/x86_amx_target_features.c
llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
llvm/test/CodeGen/X86/AMX/amx-int8-intrinsics.ll
llvm/test/CodeGen/X86/AMX/amx-tile-intrinsics.ll
Modified:
clang/docs/ClangCommandLineReference.rst
clang/include/clang/Basic/BuiltinsX86_64.def
clang/include/clang/Basic/DiagnosticSemaKinds.td
clang/include/clang/Driver/Options.td
clang/include/clang/Sema/Sema.h
clang/lib/Basic/Targets/X86.cpp
clang/lib/Basic/Targets/X86.h
clang/lib/Headers/CMakeLists.txt
clang/lib/Headers/cpuid.h
clang/lib/Headers/immintrin.h
clang/lib/Sema/SemaChecking.cpp
clang/test/Driver/x86-target-features.c
llvm/include/llvm/IR/IntrinsicsX86.td
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86InstrAMX.td
Removed:
################################################################################
diff --git a/clang/docs/ClangCommandLineReference.rst b/clang/docs/ClangCommandLineReference.rst
index 67c341feffbb..672c4ae80e73 100644
--- a/clang/docs/ClangCommandLineReference.rst
+++ b/clang/docs/ClangCommandLineReference.rst
@@ -3127,6 +3127,12 @@ X86
.. option:: -maes, -mno-aes
+.. option:: -mamx-bf16, -mno-amx-bf16
+
+.. option:: -mamx-int8, -mno-amx-int8
+
+.. option:: -mamx-tile, -mno-amx-tile
+
.. option:: -mavx, -mno-avx
.. option:: -mavx2, -mno-avx2
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index c535f43203e5..7feccd2a81a0 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -101,6 +101,22 @@ TARGET_BUILTIN(__builtin_ia32_cvtsi2ss64, "V4fV4fOiIi", "ncV:128:", "avx512f")
TARGET_BUILTIN(__builtin_ia32_cvtusi2sd64, "V2dV2dUOiIi", "ncV:128:", "avx512f")
TARGET_BUILTIN(__builtin_ia32_cvtusi2ss64, "V4fV4fUOiIi", "ncV:128:", "avx512f")
TARGET_BUILTIN(__builtin_ia32_directstore_u64, "vULi*ULi", "n", "movdiri")
+
+// AMX
+TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tilerelease, "v", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tilezero, "vUc", "n", "amx-tile")
+
+TARGET_BUILTIN(__builtin_ia32_tileloadd64, "vIUcvC*z", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tileloaddt164, "vIUcvC*z", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tilestored64, "vIUcv*z", "n", "amx-tile")
+
+TARGET_BUILTIN(__builtin_ia32_tdpbssd, "vIUcIUcIUc", "n", "amx-int8")
+TARGET_BUILTIN(__builtin_ia32_tdpbsud, "vIUcIUcIUc", "n", "amx-int8")
+TARGET_BUILTIN(__builtin_ia32_tdpbusd, "vIUcIUcIUc", "n", "amx-int8")
+TARGET_BUILTIN(__builtin_ia32_tdpbuud, "vIUcIUcIUc", "n", "amx-int8")
+TARGET_BUILTIN(__builtin_ia32_tdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16")
TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite")
#undef BUILTIN
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 5b94aa8c4325..c935545610e0 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9342,6 +9342,8 @@ def err_x86_builtin_invalid_rounding : Error<
"invalid rounding argument">;
def err_x86_builtin_invalid_scale : Error<
"scale argument must be 1, 2, 4, or 8">;
+def err_x86_builtin_tile_arg_duplicate : Error<
+ "tile arguments must refer to
diff erent tiles">;
def err_builtin_target_unsupported : Error<
"builtin is not supported on this target">;
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 50d18343f7d4..745c696bcaa3 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3065,6 +3065,12 @@ def m3dnow : Flag<["-"], "m3dnow">, Group<m_x86_Features_Group>;
def mno_3dnow : Flag<["-"], "mno-3dnow">, Group<m_x86_Features_Group>;
def m3dnowa : Flag<["-"], "m3dnowa">, Group<m_x86_Features_Group>;
def mno_3dnowa : Flag<["-"], "mno-3dnowa">, Group<m_x86_Features_Group>;
+def mamx_bf16 : Flag<["-"], "mamx-bf16">, Group<m_x86_Features_Group>;
+def mno_amx_bf16 : Flag<["-"], "mno-amx-bf16">, Group<m_x86_Features_Group>;
+def mtamx_int8 : Flag<["-"], "mamx-int8">, Group<m_x86_Features_Group>;
+def mno_amx_int8 : Flag<["-"], "mno-amx-int8">, Group<m_x86_Features_Group>;
+def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
+def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>;
def mno_sse : Flag<["-"], "mno-sse">, Group<m_x86_Features_Group>;
def msse2 : Flag<["-"], "msse2">, Group<m_x86_Features_Group>;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 9b82d2c984be..8ee7dd74712d 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -12142,6 +12142,13 @@ class Sema final {
bool CheckSystemZBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
bool CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall);
bool CheckX86BuiltinGatherScatterScale(unsigned BuiltinID, CallExpr *TheCall);
+ bool CheckX86BuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall);
+ bool CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall,
+ ArrayRef<int> ArgNums);
+ bool CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall, int ArgNum);
+ bool CheckX86BuiltinTileDuplicate(CallExpr *TheCall, ArrayRef<int> ArgNums);
+ bool CheckX86BuiltinTileRangeAndDuplicate(CallExpr *TheCall,
+ ArrayRef<int> ArgNums);
bool CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
CallExpr *TheCall);
bool CheckPPCBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 2c6742b9042a..ed62848d8070 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -62,6 +62,7 @@ static const char *const GCCRegNames[] = {
"cr0", "cr2", "cr3", "cr4", "cr8",
"dr0", "dr1", "dr2", "dr3", "dr6", "dr7",
"bnd0", "bnd1", "bnd2", "bnd3",
+ "tmm0", "tmm1", "tmm2", "tmm3", "tmm4", "tmm5", "tmm6", "tmm7",
};
const TargetInfo::AddlRegName AddlRegNames[] = {
@@ -394,7 +395,10 @@ void X86TargetInfo::setFeatureEnabledImpl(llvm::StringMap<bool> &Features,
} else if (Name == "xsaveopt" || Name == "xsavec" || Name == "xsaves") {
if (Enabled)
Features["xsave"] = true;
- }
+ } else if (Name == "amx-tile" && !Enabled) {
+ Features["amx-bf16"] = Features["amx-int8"] = false;
+ } else if ((Name == "amx-bf16" || Name == "amx-int8") && Enabled)
+ Features["amx-tile"] = true;
}
/// handleTargetFeatures - Perform initialization based on the user
@@ -529,6 +533,12 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasINVPCID = true;
} else if (Feature == "+enqcmd") {
HasENQCMD = true;
+ } else if (Feature == "+amx-bf16") {
+ HasAMXBF16 = true;
+ } else if (Feature == "+amx-int8") {
+ HasAMXINT8 = true;
+ } else if (Feature == "+amx-tile") {
+ HasAMXTILE = true;
} else if (Feature == "+serialize") {
HasSERIALIZE = true;
} else if (Feature == "+tsxldtrk") {
@@ -924,6 +934,12 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__INVPCID__");
if (HasENQCMD)
Builder.defineMacro("__ENQCMD__");
+ if (HasAMXTILE)
+ Builder.defineMacro("__AMXTILE__");
+ if (HasAMXINT8)
+ Builder.defineMacro("__AMXINT8__");
+ if (HasAMXBF16)
+ Builder.defineMacro("__AMXBF16__");
if (HasSERIALIZE)
Builder.defineMacro("__SERIALIZE__");
if (HasTSXLDTRK)
@@ -1020,6 +1036,9 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("3dnowa", true)
.Case("adx", true)
.Case("aes", true)
+ .Case("amx-bf16", true)
+ .Case("amx-int8", true)
+ .Case("amx-tile", true)
.Case("avx", true)
.Case("avx2", true)
.Case("avx512f", true)
@@ -1102,6 +1121,9 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
return llvm::StringSwitch<bool>(Feature)
.Case("adx", HasADX)
.Case("aes", HasAES)
+ .Case("amx-bf16", HasAMXBF16)
+ .Case("amx-int8", HasAMXINT8)
+ .Case("amx-tile", HasAMXTILE)
.Case("avx", SSELevel >= AVX)
.Case("avx2", SSELevel >= AVX2)
.Case("avx512f", SSELevel >= AVX512F)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index c33c608e27c8..623ac9474b5c 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -125,6 +125,9 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasPTWRITE = false;
bool HasINVPCID = false;
bool HasENQCMD = false;
+ bool HasAMXTILE = false;
+ bool HasAMXINT8 = false;
+ bool HasAMXBF16 = false;
bool HasSERIALIZE = false;
bool HasTSXLDTRK = false;
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index fd9e3a0d672f..e7bee192d918 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -2,6 +2,7 @@ set(files
adxintrin.h
altivec.h
ammintrin.h
+ amxintrin.h
arm_acle.h
arm_cmse.h
armintr.h
diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h
new file mode 100644
index 000000000000..58254e21c81a
--- /dev/null
+++ b/clang/lib/Headers/amxintrin.h
@@ -0,0 +1,225 @@
+/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMXINTRIN_H
+#define __AMXINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
+
+/// Load tile configuration from a 64-byte memory location specified by
+/// "mem_addr". The tile configuration includes the tile type palette, the
+/// number of bytes per row, and the number of rows. If the specified
+/// palette_id is zero, that signifies the init state for both the tile
+/// config and the tile data, and the tiles are zeroed. Any invalid
+/// configurations will result in #GP fault.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
+///
+/// \param __config
+/// A pointer to 512-bits configuration
+static __inline__ void __DEFAULT_FN_ATTRS
+_tile_loadconfig(const void *__config)
+{
+ __builtin_ia32_tile_loadconfig(__config);
+}
+
+/// Stores the current tile configuration to a 64-byte memory location
+/// specified by "mem_addr". The tile configuration includes the tile type
+/// palette, the number of bytes per row, and the number of rows. If tiles
+/// are not configured, all zeroes will be stored to memory.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
+///
+/// \param __config
+/// A pointer to 512-bits configuration
+static __inline__ void __DEFAULT_FN_ATTRS
+_tile_storeconfig(void *__config)
+{
+ __builtin_ia32_tile_storeconfig(__config);
+}
+
+/// Release the tile configuration to return to the init state, which
+/// releases all storage it currently holds.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
+static __inline__ void __DEFAULT_FN_ATTRS
+_tile_release(void)
+{
+ __builtin_ia32_tilerelease();
+}
+
+/// Load tile rows from memory specifieid by "base" address and "stride" into
+/// destination tile "dst" using the tile configuration previously configured
+/// via "_tile_loadconfig".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
+///
+/// \param dst
+/// A destination tile. Max size is 1024 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+#define _tile_loadd(dst, base, stride) \
+ __builtin_ia32_tileloadd64((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride))
+
+/// Load tile rows from memory specifieid by "base" address and "stride" into
+/// destination tile "dst" using the tile configuration previously configured
+/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
+/// that the data will likely not be reused in the near future and the data
+/// caching can be optimized accordingly.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
+///
+/// \param dst
+/// A destination tile. Max size is 1024 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+#define _tile_stream_loadd(dst, base, stride) \
+ __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride))
+
+/// Store the tile specified by "src" to memory specifieid by "base" address and
+/// "stride" using the tile configuration previously configured via
+/// "_tile_loadconfig".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
+///
+/// \param dst
+/// A destination tile. Max size is 1024 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be stored in memory.
+#define _tile_stored(dst, base, stride) \
+ __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
+
+/// Zero the tile specified by "tdest".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
+///
+/// \param tile
+/// The destination tile to be zero. Max size is 1024 Bytes.
+#define _tile_zero(tile) __builtin_ia32_tilezero((tile))
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
+/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
+/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
+/// and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbssd(dst, src0, src1) __builtin_ia32_tdpbssd((dst), (src0), (src1))
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
+/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
+/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in "dst", and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbsud(dst, src0, src1) __builtin_ia32_tdpbsud((dst), (src0), (src1))
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
+/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
+/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
+/// and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbusd(dst, src0, src1) __builtin_ia32_tdpbusd((dst), (src0), (src1))
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
+/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
+/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
+/// "dst", and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbuud(dst, src0, src1) __builtin_ia32_tdpbuud((dst), (src0), (src1))
+
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
+/// src1, accumulating the intermediate single-precision (32-bit) floating-point
+/// elements with elements in "dst", and store the 32-bit result back to tile
+/// "dst".
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbf16ps(dst, src0, src1) \
+ __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __x86_64__ */
+#endif /* __AMXINTRIN_H */
diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h
index 6c38b578b30e..2a88c042d046 100644
--- a/clang/lib/Headers/cpuid.h
+++ b/clang/lib/Headers/cpuid.h
@@ -190,6 +190,9 @@
#define bit_TSXLDTRK 0x00010000
#define bit_PCONFIG 0x00040000
#define bit_IBT 0x00100000
+#define bit_AMXBF16 0x00400000
+#define bit_AMXTILE 0x01000000
+#define bit_AMXINT8 0x02000000
/* Features in %eax for leaf 7 sub-leaf 1 */
#define bit_AVX512BF16 0x00000020
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index dd27ca2f6605..e9dff2310fdf 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -471,6 +471,11 @@ _storebe_i64(void * __P, long long __D) {
#include <invpcidintrin.h>
#endif
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AMXTILE__) || defined(__AMXINT8__) || defined(__AMXBF16__)
+#include <amxintrin.h>
+#endif
+
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__AVX512VP2INTERSECT__)
#include <avx512vp2intersectintrin.h>
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 0ce84ea382b5..2b52415b2800 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -88,6 +88,7 @@
#include "llvm/Support/SaveAndRestore.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <bitset>
#include <cassert>
#include <cstddef>
#include <cstdint>
@@ -3607,6 +3608,64 @@ bool Sema::CheckX86BuiltinGatherScatterScale(unsigned BuiltinID,
<< Arg->getSourceRange();
}
+enum { TileRegLow = 0, TileRegHigh = 7 };
+
+bool Sema::CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall,
+ ArrayRef<int> ArgNums) {
+ for (int ArgNum : ArgNums) {
+ if (SemaBuiltinConstantArgRange(TheCall, ArgNum, TileRegLow, TileRegHigh))
+ return true;
+ }
+ return false;
+}
+
+bool Sema::CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall, int ArgNum) {
+ return SemaBuiltinConstantArgRange(TheCall, ArgNum, TileRegLow, TileRegHigh);
+}
+
+bool Sema::CheckX86BuiltinTileDuplicate(CallExpr *TheCall,
+ ArrayRef<int> ArgNums) {
+ // Because the max number of tile register is TileRegHigh + 1, so here we use
+ // each bit to represent the usage of them in bitset.
+ std::bitset<TileRegHigh + 1> ArgValues;
+ for (int ArgNum : ArgNums) {
+ llvm::APSInt Arg;
+ SemaBuiltinConstantArg(TheCall, ArgNum, Arg);
+ int ArgExtValue = Arg.getExtValue();
+ assert((ArgExtValue >= TileRegLow || ArgExtValue <= TileRegHigh) &&
+ "Incorrect tile register num.");
+ if (ArgValues.test(ArgExtValue))
+ return Diag(TheCall->getBeginLoc(),
+ diag::err_x86_builtin_tile_arg_duplicate)
+ << TheCall->getArg(ArgNum)->getSourceRange();
+ ArgValues.set(ArgExtValue);
+ }
+ return false;
+}
+
+bool Sema::CheckX86BuiltinTileRangeAndDuplicate(CallExpr *TheCall,
+ ArrayRef<int> ArgNums) {
+ return CheckX86BuiltinTileArgumentsRange(TheCall, ArgNums) ||
+ CheckX86BuiltinTileDuplicate(TheCall, ArgNums);
+}
+
+bool Sema::CheckX86BuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
+ switch (BuiltinID) {
+ default:
+ return false;
+ case X86::BI__builtin_ia32_tileloadd64:
+ case X86::BI__builtin_ia32_tileloaddt164:
+ case X86::BI__builtin_ia32_tilestored64:
+ case X86::BI__builtin_ia32_tilezero:
+ return CheckX86BuiltinTileArgumentsRange(TheCall, 0);
+ case X86::BI__builtin_ia32_tdpbssd:
+ case X86::BI__builtin_ia32_tdpbsud:
+ case X86::BI__builtin_ia32_tdpbusd:
+ case X86::BI__builtin_ia32_tdpbuud:
+ case X86::BI__builtin_ia32_tdpbf16ps:
+ return CheckX86BuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2});
+ }
+}
static bool isX86_32Builtin(unsigned BuiltinID) {
// These builtins only work on x86-32 targets.
switch (BuiltinID) {
@@ -3640,6 +3699,10 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
if (CheckX86BuiltinGatherScatterScale(BuiltinID, TheCall))
return true;
+ // If the intrinsic has a tile arguments, make sure they are valid.
+ if (CheckX86BuiltinTileArguments(BuiltinID, TheCall))
+ return true;
+
// For intrinsics which take an immediate value as part of the instruction,
// range check them here.
int i = 0, l = 0, u = 0;
diff --git a/clang/test/CodeGen/AMX/amx.c b/clang/test/CodeGen/AMX/amx.c
new file mode 100644
index 000000000000..89b486f7a601
--- /dev/null
+++ b/clang/test/CodeGen/AMX/amx.c
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-int8 \
+// RUN: -target-feature +amx-bf16 -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK
+
+#include <immintrin.h>
+
+void test_amx(void *data) {
+ //CHECK-LABEL: @test_amx
+ //CHECK: call void @llvm.x86.ldtilecfg(i8* %{{.*}})
+ //CHECK: call void @llvm.x86.sttilecfg(i8* %{{.*}})
+ //CHECK: call void @llvm.x86.tilerelease()
+ //CHECK: call void @llvm.x86.tilezero(i8 3)
+ //CHECK: call void @llvm.x86.tileloadd64(i8 4, i8* %{{.*}}, i64 8)
+ //CHECK: call void @llvm.x86.tileloaddt164(i8 0, i8* %{{.*}}, i64 1)
+ //CHECK: call void @llvm.x86.tilestored64(i8 0, i8* %{{.*}}, i64 1)
+ //CHECK: call void @llvm.x86.tdpbssd(i8 1, i8 2, i8 3)
+ //CHECK: call void @llvm.x86.tdpbsud(i8 1, i8 2, i8 3)
+ //CHECK: call void @llvm.x86.tdpbusd(i8 1, i8 2, i8 3)
+ //CHECK: call void @llvm.x86.tdpbuud(i8 1, i8 2, i8 3)
+ //CHECK: call void @llvm.x86.tdpbf16ps(i8 1, i8 2, i8 3)
+ _tile_loadconfig(data);
+ _tile_storeconfig(data);
+ _tile_release();
+ _tile_zero(3);
+ _tile_loadd(4, data, 8);
+ _tile_stream_loadd(0, data, 1);
+ _tile_stored(0, data, 1);
+ _tile_dpbssd(1, 2, 3);
+ _tile_dpbsud(1, 2, 3);
+ _tile_dpbusd(1, 2, 3);
+ _tile_dpbuud(1, 2, 3);
+ _tile_dpbf16ps(1, 2, 3);
+}
diff --git a/clang/test/CodeGen/AMX/amx_errors.c b/clang/test/CodeGen/AMX/amx_errors.c
new file mode 100644
index 000000000000..13a2b33b5a0a
--- /dev/null
+++ b/clang/test/CodeGen/AMX/amx_errors.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tile -target-feature +amx-int8 -target-feature +amx-bf16 -emit-llvm -fsyntax-only -verify
+
+#include <immintrin.h>
+
+void test_amx(void *data) {
+ _tile_zero(16); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
+ _tile_loadd(19, data, 16); // expected-error {{argument value 19 is outside the valid range [0, 7]}}
+ _tile_stream_loadd(23, data, 1); // expected-error {{argument value 23 is outside the valid range [0, 7]}}
+ _tile_stored(88, data, 1); // expected-error {{argument value 88 is outside the valid range [0, 7]}}
+ _tile_dpbssd(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
+ _tile_dpbssd(0, 16, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
+ _tile_dpbuud(0, 2, 16); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
+ _tile_dpbsud(1, 1, 3); // expected-error {{tile arguments must refer to
diff erent tiles}}
+ _tile_dpbsud(7, 1, 7); // expected-error {{tile arguments must refer to
diff erent tiles}}
+ _tile_dpbsud(4, 3, 3); // expected-error {{tile arguments must refer to
diff erent tiles}}
+ _tile_dpbf16ps(4, 3, 3); // expected-error {{tile arguments must refer to
diff erent tiles}}
+}
diff --git a/clang/test/CodeGen/AMX/amx_inline_asm.c b/clang/test/CodeGen/AMX/amx_inline_asm.c
new file mode 100644
index 000000000000..9d828f8ac94e
--- /dev/null
+++ b/clang/test/CodeGen/AMX/amx_inline_asm.c
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-int8 -target-feature +amx-bf16 -emit-llvm -o - -Wall -Werror -pedantic | FileCheck %s --check-prefixes=CHECK,X86_64
+
+void f_tilemul(short a)
+{
+ //CHECK: call void asm sideeffect "tileloadd 0(%rsi,%r13,4), %tmm0 \0A\09tileloadd 0(%rdx,%r14,4), %tmm6 \0A\09tdpbf16ps %tmm6, %tmm0, %tmm7 \0A\09tilestored %tmm7, 0(%r12,%r15,4) \0A\09", "~{memory},~{tmm0},~{tmm6},~{tmm7},~{dirflag},~{fpsr},~{flags}"()
+ __asm__ volatile ("tileloadd 0(%%rsi,%%r13,4), %%tmm0 \n\t"
+ "tileloadd 0(%%rdx,%%r14,4), %%tmm6 \n\t"
+ "tdpbf16ps %%tmm6, %%tmm0, %%tmm7 \n\t"
+ "tilestored %%tmm7, 0(%%r12,%%r15,4) \n\t"
+ ::: "memory", "tmm0", "tmm6", "tmm7");
+}
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index b96eed287bd9..817caeecd71e 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -232,3 +232,18 @@
// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-tsxldtrk %s -### -o %t.o 2>&1 | FileCheck --check-prefix=NO-TSXLDTRK %s
// TSXLDTRK: "-target-feature" "+tsxldtrk"
// NO-TSXLDTRK: "-target-feature" "-tsxldtrk"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-tile %s -### -o %t.o 2>&1 | FileCheck --check-prefix=AMX-TILE %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-tile %s -### -o %t.o 2>&1 | FileCheck --check-prefix=NO-AMX-TILE %s
+// AMX-TILE: "-target-feature" "+amx-tile"
+// NO-AMX-TILE: "-target-feature" "-amx-tile"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-bf16 %s -### -o %t.o 2>&1 | FileCheck --check-prefix=AMX-BF16 %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-bf16 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AMX-BF16 %s
+// AMX-BF16: "-target-feature" "+amx-bf16"
+// NO-AMX-BF16: "-target-feature" "-amx-bf16"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-int8 %s -### -o %t.o 2>&1 | FileCheck --check-prefix=AMX-INT8 %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-int8 %s -### -o %t.o 2>&1 | FileCheck --check-prefix=NO-AMX-INT8 %s
+// AMX-INT8: "-target-feature" "+amx-int8"
+// NO-AMX-INT8: "-target-feature" "-amx-int8"
diff --git a/clang/test/Preprocessor/x86_amx_target_features.c b/clang/test/Preprocessor/x86_amx_target_features.c
new file mode 100644
index 000000000000..68a3d7f950b1
--- /dev/null
+++ b/clang/test/Preprocessor/x86_amx_target_features.c
@@ -0,0 +1,35 @@
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-tile -x c -E -dM -o - %s | FileCheck -check-prefix=AMX-TILE %s
+
+// AMX-TILE: #define __AMXTILE__ 1
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-bf16 -x c -E -dM -o - %s | FileCheck -check-prefix=AMX-BF16 %s
+
+// AMX-BF16: #define __AMXBF16__ 1
+// AMX-BF16: #define __AMXTILE__ 1
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-int8 -x c -E -dM -o - %s | FileCheck -check-prefix=AMX-INT8 %s
+
+// AMX-INT8: #define __AMXINT8__ 1
+// AMX-INT8: #define __AMXTILE__ 1
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-tile -x c -E -dM -o - %s | FileCheck -check-prefix=NOAMX-TILE %s
+
+// NOAMX-TILE-NOT: #define __AMXTILE__ 1
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-bf16 -x c -E -dM -o - %s | FileCheck -check-prefix=NOAMX-BF16 %s
+
+// NOAMX-BF16-NOT: #define __AMXBF16__ 1
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -amx-bf16 -mno-amx-tile -x c -E -dM -o - %s | FileCheck -check-prefix=NOAMX-BF16 %s
+
+// NOAMX-BF16-NOT: #define __AMXTILE__ 1
+// NOAMX-BF16-NOT: #define __AMXBF16__ 1
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-int8 -x c -E -dM -o - %s | FileCheck -check-prefix=NOAMX-INT8 %s
+
+// NOAMX-INT8-NOT: #define __AMXINT8__ 1
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -amx-int8 -mno-amx-tile -x c -E -dM -o - %s | FileCheck -check-prefix=NOAMX-INT8 %s
+
+// NOAMX-INT8-NOT: #define __AMXTILE__ 1
+// NOAMX-INT8-NOT: #define __AMXINT8__ 1
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index b3bf18720595..3f86fd075d3a 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -4948,3 +4948,32 @@ let TargetPrefix = "x86" in {
def int_x86_xresldtrk : GCCBuiltin<"__builtin_ia32_xresldtrk">,
Intrinsic<[], [], []>;
}
+//===----------------------------------------------------------------------===//
+// AMX - Intel AMX extensions
+
+let TargetPrefix = "x86" in {
+ def int_x86_ldtilecfg : GCCBuiltin<"__builtin_ia32_tile_loadconfig">,
+ Intrinsic<[], [llvm_ptr_ty], []>;
+ def int_x86_sttilecfg : GCCBuiltin<"__builtin_ia32_tile_storeconfig">,
+ Intrinsic<[], [llvm_ptr_ty], []>;
+ def int_x86_tilerelease : GCCBuiltin<"__builtin_ia32_tilerelease">,
+ Intrinsic<[], [], []>;
+ def int_x86_tilezero : GCCBuiltin<"__builtin_ia32_tilezero">,
+ Intrinsic<[], [llvm_i8_ty], []>;
+ def int_x86_tileloadd64 : GCCBuiltin<"__builtin_ia32_tileloadd64">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], []>;
+ def int_x86_tileloaddt164 : GCCBuiltin<"__builtin_ia32_tileloaddt164">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], []>;
+ def int_x86_tilestored64 : GCCBuiltin<"__builtin_ia32_tilestored64">,
+ Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], []>;
+ def int_x86_tdpbssd : GCCBuiltin<"__builtin_ia32_tdpbssd">,
+ Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>;
+ def int_x86_tdpbsud : GCCBuiltin<"__builtin_ia32_tdpbsud">,
+ Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>;
+ def int_x86_tdpbusd : GCCBuiltin<"__builtin_ia32_tdpbusd">,
+ Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>;
+ def int_x86_tdpbuud : GCCBuiltin<"__builtin_ia32_tdpbuud">,
+ Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>;
+ def int_x86_tdpbf16ps : GCCBuiltin<"__builtin_ia32_tdpbf16ps">,
+ Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>;
+}
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 5a57ca7646ff..fb285376c580 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4435,8 +4435,39 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
}
+ case Intrinsic::x86_tileloadd64:
+ case Intrinsic::x86_tileloaddt164:
+ case Intrinsic::x86_tilestored64: {
+ if (!Subtarget->hasAMXTILE())
+ break;
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
+ case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
+ case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
+ }
+ // FIXME: Match displacement and scale.
+ unsigned TIndex = Node->getConstantOperandVal(2);
+ SDValue TReg = getI8Imm(TIndex, dl);
+ SDValue Base = Node->getOperand(3);
+ SDValue Scale = getI8Imm(1, dl);
+ SDValue Index = Node->getOperand(4);
+ SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+ SDValue Chain = Node->getOperand(0);
+ MachineSDNode *CNode;
+ if (Opc == X86::PTILESTORED) {
+ SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
+ CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ } else {
+ SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
+ CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ }
+ ReplaceNode(Node, CNode);
+ return;
+ }
}
-
break;
}
case ISD::BRIND: {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 88a563720c2a..d7a45f6fb7c4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -33044,6 +33044,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
+ auto TMMImmToTMMReg = [](unsigned Imm) {
+ assert (Imm < 8 && "Illegal tmm index");
+ return X86::TMM0 + Imm;
+ };
switch (MI.getOpcode()) {
default: llvm_unreachable("Unexpected instr type to insert");
case X86::TLS_addr32:
@@ -33326,6 +33330,67 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent();
return BB;
}
+ case X86::PTDPBSSD:
+ case X86::PTDPBSUD:
+ case X86::PTDPBUSD:
+ case X86::PTDPBUUD:
+ case X86::PTDPBF16PS: {
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
+ case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
+ case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
+ case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
+ case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
+ }
+
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+ case X86::PTILEZERO: {
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Imm = MI.getOperand(0).getImm();
+ BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+ case X86::PTILELOADD:
+ case X86::PTILELOADDT1:
+ case X86::PTILESTORED: {
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ case X86::PTILELOADD: Opc = X86::TILELOADD; break;
+ case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
+ case X86::PTILESTORED: Opc = X86::TILESTORED; break;
+ }
+
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
+ unsigned CurOp = 0;
+ if (Opc != X86::TILESTORED)
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
+ RegState::Define);
+
+ MIB.add(MI.getOperand(CurOp++)); // base
+ MIB.add(MI.getOperand(CurOp++)); // scale
+ MIB.add(MI.getOperand(CurOp++)); // index -- stride
+ MIB.add(MI.getOperand(CurOp++)); // displacement
+ MIB.add(MI.getOperand(CurOp++)); // segment
+
+ if (Opc == X86::TILESTORED)
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
+ RegState::Undef);
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
}
}
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index deefb3eecf39..e26dd5050a23 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -18,9 +18,11 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
let SchedRW = [WriteSystem] in {
let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
- "ldtilecfg\t$src", []>, VEX, T8PS;
+ "ldtilecfg\t$src",
+ [(int_x86_ldtilecfg addr:$src)]>, VEX, T8PS;
def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
- "sttilecfg\t$src", []>, VEX, T8PD;
+ "sttilecfg\t$src",
+ [(int_x86_sttilecfg addr:$src)]>, VEX, T8PD;
def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
(ins sibmem:$src),
"tileloadd\t{$src, $dst|$dst, $src}", []>,
@@ -31,7 +33,7 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
VEX, T8PD;
let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
def TILERELEASE : I<0x49, MRM_C0, (outs), (ins),
- "tilerelease", []>, VEX, T8PS;
+ "tilerelease", [(int_x86_tilerelease)]>, VEX, T8PS;
def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs),
(ins sibmem:$dst, TILE:$src),
"tilestored\t{$src, $dst|$dst, $src}", []>,
@@ -39,6 +41,17 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
def TILEZERO : I<0x49, MRMr0, (outs TILE:$dst), (ins),
"tilezero\t$dst", []>,
VEX, T8XD;
+
+ let usesCustomInserter = 1 in {
+ // Pseudo instructions, using immediates instead of tile registers.
+ // To be translated to the actual instructions in X86ISelLowering.cpp
+ def PTILELOADD : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>;
+ def PTILELOADDT1 : PseudoI<(outs), (ins u8imm:$src1,
+ sibmem:$src2), []>;
+ def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>;
+ def PTILEZERO : PseudoI<(outs), (ins u8imm:$src),
+ [(int_x86_tilezero imm:$src)]>;
+ }
} // SchedRW
} // HasAMXTILE
@@ -62,6 +75,27 @@ let Predicates = [HasAMXINT8, In64BitMode] in {
"tdpbuud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
VEX_4V, T8PS;
}
+
+ let usesCustomInserter = 1 in {
+ // Pseudo instructions, using immediates instead of tile registers.
+ // To be translated to the actual instructions in X86ISelLowering.cpp
+ def PTDPBSSD : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbssd imm:$src1,
+ imm:$src2, imm:$src3)]>;
+ def PTDPBSUD : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbsud imm:$src1,
+ imm:$src2, imm:$src3)]>;
+ def PTDPBUSD : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbusd imm:$src1,
+ imm:$src2, imm:$src3)]>;
+ def PTDPBUUD : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbuud imm:$src1,
+ imm:$src2, imm:$src3)]>;
+ }
}
} // HasAMXTILE
@@ -72,5 +106,14 @@ let Predicates = [HasAMXBF16, In64BitMode] in {
(ins TILE:$src1, TILE:$src2, TILE:$src3),
"tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[]>, VEX_4V, T8XS;
+
+ let usesCustomInserter = 1 in {
+ // Pseudo instructions, using immediates instead of tile registers.
+ // To be translated to the actual instructions in X86ISelLowering.cpp
+ def PTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbf16ps imm:$src1,
+ imm:$src2, imm:$src3)]>;
+ }
}
} // HasAMXTILE, HasAMXBF16
diff --git a/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
new file mode 100644
index 000000000000..a415d9c15242
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -mattr=+amx-bf16 -verify-machineinstrs | FileCheck %s
+
+define void @test_amx() {
+; CHECK-LABEL: test_amx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: tdpbf16ps %tmm7, %tmm4, %tmm3
+; CHECK-NEXT: retq
+ call void @llvm.x86.tdpbf16ps(i8 3, i8 4, i8 7)
+ ret void
+}
+
+declare void @llvm.x86.tdpbf16ps(i8 %tile0, i8 %tile1, i8 %tile2)
diff --git a/llvm/test/CodeGen/X86/AMX/amx-int8-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-int8-intrinsics.ll
new file mode 100644
index 000000000000..49e69aeab510
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-int8-intrinsics.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -verify-machineinstrs | FileCheck %s
+
+define void @test_amx() {
+; CHECK-LABEL: test_amx:
+; CHECK: # %bb.0:
+ call void @llvm.x86.tdpbssd(i8 3, i8 4, i8 7)
+; CHECK-NEXT: tdpbssd %tmm7, %tmm4, %tmm3
+
+ call void @llvm.x86.tdpbsud(i8 3, i8 4, i8 7)
+; CHECK-NEXT: tdpbsud %tmm7, %tmm4, %tmm3
+
+ call void @llvm.x86.tdpbusd(i8 3, i8 0, i8 7)
+; CHECK-NEXT: tdpbusd %tmm7, %tmm0, %tmm3
+
+ call void @llvm.x86.tdpbuud(i8 3, i8 4, i8 1)
+; CHECK-NEXT: tdpbuud %tmm1, %tmm4, %tmm3
+ ret void
+}
+
+declare void @llvm.x86.tdpbssd(i8 %tile0, i8 %tile1, i8 %tile2)
+declare void @llvm.x86.tdpbsud(i8 %tile0, i8 %tile1, i8 %tile2)
+declare void @llvm.x86.tdpbusd(i8 %tile0, i8 %tile1, i8 %tile2)
+declare void @llvm.x86.tdpbuud(i8 %tile0, i8 %tile1, i8 %tile2)
diff --git a/llvm/test/CodeGen/X86/AMX/amx-tile-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-tile-intrinsics.ll
new file mode 100644
index 000000000000..6b8e040abb9a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-tile-intrinsics.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -verify-machineinstrs | FileCheck %s
+
+define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
+; CHECK-LABEL: test_amx:
+; CHECK: # %bb.0:
+ call void @llvm.x86.ldtilecfg(i8* %pointer)
+; CHECK-NEXT: ldtilecfg (%rdi)
+
+ call void @llvm.x86.sttilecfg(i8* %pointer)
+; CHECK-NEXT: sttilecfg (%rdi)
+
+ call void @llvm.x86.tilerelease()
+; CHECK-NEXT: tilerelease
+
+ call void @llvm.x86.tilezero(i8 3)
+; CHECK-NEXT: tilezero %tmm3
+
+ call void @llvm.x86.tileloadd64(i8 3, i8* %base, i64 %stride)
+; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm3
+
+ call void @llvm.x86.tileloaddt164(i8 3, i8* %base, i64 %stride)
+; CHECK-NEXT: tileloaddt1 (%rsi,%rdx), %tmm3
+
+ call void @llvm.x86.tilestored64(i8 3, i8* %base, i64 %stride)
+; CHECK-NEXT: tilestored %tmm3, (%rsi,%rdx)
+ ret void
+}
+
+declare void @llvm.x86.tileloadd64(i8 %tile, i8* %base, i64 %stride)
+declare void @llvm.x86.tileloaddt164(i8 %tile, i8* %base, i64 %stride)
+declare void @llvm.x86.tilestored64(i8 %tile, i8* %base, i64 %stride)
+declare void @llvm.x86.ldtilecfg(i8* %pointer)
+declare void @llvm.x86.sttilecfg(i8* %pointer)
+declare void @llvm.x86.tilerelease()
+declare void @llvm.x86.tilezero(i8 %tile)
More information about the cfe-commits
mailing list