[clang] [llvm] [X86][AMX] Support AMX-AVX512 (PR #114070)
via cfe-commits
cfe-commits at lists.llvm.org
Tue Oct 29 08:15:01 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Phoebe Wang (phoebewang)
<details>
<summary>Changes</summary>
---
Patch is 81.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114070.diff
31 Files Affected:
- (modified) clang/docs/ReleaseNotes.rst (+2)
- (modified) clang/include/clang/Basic/BuiltinsX86_64.def (+13)
- (modified) clang/include/clang/Driver/Options.td (+2)
- (modified) clang/lib/Basic/Targets/X86.cpp (+6)
- (modified) clang/lib/Basic/Targets/X86.h (+1)
- (modified) clang/lib/Headers/CMakeLists.txt (+1)
- (added) clang/lib/Headers/amxavx512intrin.h (+381)
- (modified) clang/lib/Headers/immintrin.h (+4)
- (modified) clang/lib/Sema/SemaX86.cpp (+6)
- (added) clang/test/CodeGen/X86/amx_avx512_api.c (+52)
- (added) clang/test/CodeGen/X86/amxavx512-builtins.c (+41)
- (modified) clang/test/CodeGen/attr-target-x86.c (+4-4)
- (modified) clang/test/Driver/x86-target-features.c (+7)
- (modified) clang/test/Preprocessor/x86_target_features.c (+7)
- (modified) llvm/include/llvm/IR/IntrinsicsX86.td (+50)
- (modified) llvm/include/llvm/TargetParser/X86TargetParser.def (+1)
- (modified) llvm/lib/Target/X86/X86.td (+4)
- (modified) llvm/lib/Target/X86/X86ExpandPseudo.cpp (+60-4)
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+76)
- (modified) llvm/lib/Target/X86/X86InstrAMX.td (+147)
- (modified) llvm/lib/Target/X86/X86InstrPredicates.td (+1)
- (modified) llvm/lib/Target/X86/X86LowerAMXType.cpp (+11)
- (modified) llvm/lib/Target/X86/X86PreTileConfig.cpp (+17-2)
- (modified) llvm/lib/TargetParser/Host.cpp (+4)
- (modified) llvm/lib/TargetParser/X86TargetParser.cpp (+2)
- (added) llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll (+171)
- (added) llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll (+116)
- (added) llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll (+61)
- (added) llvm/test/MC/Disassembler/X86/amx-avx512.txt (+106)
- (added) llvm/test/MC/X86/amx-avx512-att.s (+105)
- (added) llvm/test/MC/X86/amx-avx512-intel.s (+105)
``````````diff
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index ce046a305c89b6..d45bd1240dd173 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -611,6 +611,8 @@ X86 Support
* Supported MINMAX intrinsics of ``*_(mask(z)))_minmax(ne)_p[s|d|h|bh]`` and
``*_(mask(z)))_minmax_s[s|d|h]``.
+- Support ISA of ``AMX-AVX512``.
+
- All intrinsics in adcintrin.h can now be used in constant expressions.
- All intrinsics in adxintrin.h can now be used in constant expressions.
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 2c591edb2835cd..70644f3f6b6054 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -128,6 +128,12 @@ TARGET_BUILTIN(__builtin_ia32_tdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i",
TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16")
TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh_internal, "V32xUsUsV256iUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl_internal, "V32xUsUsV256iUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tilemovrow_internal, "V16iUsUsV256iUi", "n", "amx-avx512")
// AMX
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
@@ -148,6 +154,13 @@ TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite")
TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps, "V16fIUcUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h, "V32yIUcUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l, "V32yIUcUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh, "V32xIUcUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl, "V32xIUcUi", "n", "amx-avx512")
+TARGET_BUILTIN(__builtin_ia32_tilemovrow, "V16iIUcUi", "n", "amx-avx512")
+
TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi")
TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd")
TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiv*SLLiSLLiIi", "n", "cmpccxadd")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 2ddb2f5312148e..fd200abebceb11 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6277,6 +6277,8 @@ def mno_80387 : Flag<["-"], "mno-80387">, Alias<mno_x87>;
def mno_fp_ret_in_387 : Flag<["-"], "mno-fp-ret-in-387">, Alias<mno_x87>;
def mmmx : Flag<["-"], "mmmx">, Group<m_x86_Features_Group>;
def mno_mmx : Flag<["-"], "mno-mmx">, Group<m_x86_Features_Group>;
+def mamx_avx512 : Flag<["-"], "mamx-avx512">, Group<m_x86_Features_Group>;
+def mno_amx_avx512 : Flag<["-"], "mno-amx-avx512">, Group<m_x86_Features_Group>;
def mamx_bf16 : Flag<["-"], "mamx-bf16">, Group<m_x86_Features_Group>;
def mno_amx_bf16 : Flag<["-"], "mno-amx-bf16">, Group<m_x86_Features_Group>;
def mamx_complex : Flag<["-"], "mamx-complex">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 5448bd841959f4..52cab65cbd9451 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -418,6 +418,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAMXTILE = true;
} else if (Feature == "+amx-complex") {
HasAMXCOMPLEX = true;
+ } else if (Feature == "+amx-avx512") {
+ HasAMXAVX512 = true;
} else if (Feature == "+cmpccxadd") {
HasCMPCCXADD = true;
} else if (Feature == "+raoint") {
@@ -935,6 +937,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AMX_FP16__");
if (HasAMXCOMPLEX)
Builder.defineMacro("__AMX_COMPLEX__");
+ if (HasAMXAVX512)
+ Builder.defineMacro("__AMX_AVX512__");
if (HasCMPCCXADD)
Builder.defineMacro("__CMPCCXADD__");
if (HasRAOINT)
@@ -1060,6 +1064,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
return llvm::StringSwitch<bool>(Name)
.Case("adx", true)
.Case("aes", true)
+ .Case("amx-avx512", true)
.Case("amx-bf16", true)
.Case("amx-complex", true)
.Case("amx-fp16", true)
@@ -1177,6 +1182,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
return llvm::StringSwitch<bool>(Feature)
.Case("adx", HasADX)
.Case("aes", HasAES)
+ .Case("amx-avx512", HasAMXAVX512)
.Case("amx-bf16", HasAMXBF16)
.Case("amx-complex", HasAMXCOMPLEX)
.Case("amx-fp16", HasAMXFP16)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index a99ae62984c7d5..ce7458ae99ad64 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -156,6 +156,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasAMXINT8 = false;
bool HasAMXBF16 = false;
bool HasAMXCOMPLEX = false;
+ bool HasAMXAVX512 = false;
bool HasSERIALIZE = false;
bool HasTSXLDTRK = false;
bool HasUSERMSR = false;
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index ff392e7122a448..88e8f282bd7ec2 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -146,6 +146,7 @@ set(x86_files
adcintrin.h
adxintrin.h
ammintrin.h
+ amxavx512intrin.h
amxcomplexintrin.h
amxfp16intrin.h
amxintrin.h
diff --git a/clang/lib/Headers/amxavx512intrin.h b/clang/lib/Headers/amxavx512intrin.h
new file mode 100644
index 00000000000000..f819696f8086b7
--- /dev/null
+++ b/clang/lib/Headers/amxavx512intrin.h
@@ -0,0 +1,381 @@
+/*===--------------------- amxavx512intrin.h - AMXAVX512 --------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_AVX512INTRIN_H
+#define __AMX_AVX512INTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_AVX512 \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-avx512")))
+
+/// Moves a row from a tile register to a zmm destination register, converting
+/// the int32 source elements to fp32. The row of the tile is selected by an
+/// 32b GPR.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m512i _tile_cvtrowd2ps(__tile tsrc, unsigned int row);
+/// \endcode
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL >> 3
+/// row_index := row & 0xffff
+/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes / 4) - 1
+/// IF i + row_chunk / 4 >= tsrc.colsb / 4
+/// dst.dword[i] := 0
+/// ELSE
+/// dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE)
+/// FI
+/// ENDFOR
+/// dst[MAX_VL-1:VL] := 0
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCVTROWD2PS instruction.
+///
+/// \param tsrc
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param row
+/// The row of the source tile
+#define _tile_cvtrowd2ps(tsrc, row) __builtin_ia32_tcvtrowd2ps(tsrc, row)
+
+/// Moves a row from a tile register to a zmm destination register, converting
+/// the fp32 source elements to bf16. It places the resulting bf16 elements
+/// in the high 16 bits within each dword. The row of the tile is selected
+/// by an 32b GPR.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m512i _tile_cvtrowps2pbf16h(__tile tsrc, unsigned int row);
+/// \endcode
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL >> 3
+/// row_index := row & 0xffff
+/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes / 4) - 1
+/// IF i + row_chunk / 4 >= tsrc.colsb / 4
+/// dst.dword[i] := 0
+/// ELSE
+/// dst.word[2*i+0] := 0
+/// dst.bf16[2*i+1] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
+/// FI
+/// ENDFOR
+/// dst[MAX_VL-1:VL] := 0
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCVTROWPS2PBF16H instruction.
+///
+/// \param tsrc
+/// The source tile. Max size is 1024 Bytes.
+/// \param row
+/// The the row of the source tile.
+#define _tile_cvtrowps2pbf16h(tsrc, row) \
+ __builtin_ia32_tcvtrowps2pbf16h(tsrc, row)
+
+/// Moves a row from a tile register to a zmm destination register, converting
+/// the fp32 source elements to bf16. It places the resulting bf16 elements
+/// in the low 16 bits within each dword. The row of the tile is selected
+/// by an 32b GPR.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m512i _tile_cvtrowps2pbf16l(__tile tsrc, unsigned int row);
+/// \endcode
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL >> 3
+/// row_index := row & 0xffff
+/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes / 4) - 1
+/// IF i + row_chunk / 4 >= tsrc.colsb / 4
+/// dst.dword[i] := 0
+/// ELSE
+/// dst.word[2*i+1] := 0
+/// dst.bf16[2*i+0] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
+/// FI
+/// ENDFOR
+/// dst[MAX_VL-1:VL] := 0
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCVTROWPS2PBF16L instruction.
+///
+/// \param tsrc
+/// The source tile. Max size is 1024 Bytes.
+/// \param row
+/// The the row of the source tile.
+#define _tile_cvtrowps2pbf16l(tsrc, row) \
+ __builtin_ia32_tcvtrowps2pbf16l(tsrc, row)
+
+/// Moves a row from a tile register to a zmm destination register, converting
+/// the fp32 source elements to fp16. It places the resulting fp16 elements
+/// in the high 16 bits within each dword. The row of the tile is selected
+/// by an 32b GPR.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m512i _tile_cvtrowps2phh(__tile tsrc, unsigned int row);
+/// \endcode
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL >> 3
+/// row_index := row & 0xffff
+/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes / 4) - 1
+/// IF i + row_chunk / 4 >= tsrc.colsb / 4
+/// dst.dword[i] := 0
+/// ELSE
+/// dst.word[2*i+0] := 0
+/// dst.fp16[2*i+1] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
+/// FI
+/// ENDFOR
+/// dst[MAX_VL-1:VL] := 0
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCVTROWPS2PHH instruction.
+///
+/// \param tsrc
+/// The source tile. Max size is 1024 Bytes.
+/// \param row
+/// The the row of the source tile.
+#define _tile_cvtrowps2phh(tsrc, row) __builtin_ia32_tcvtrowps2phh(tsrc, row)
+
+/// Moves a row from a tile register to a zmm destination register, converting
+/// the fp32 source elements to fp16. It places the resulting fp16 elements
+/// in the low 16 bits within each dword. The row of the tile is selected
+/// by an 32b GPR.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m512i _tile_cvtrowps2phl(__tile tsrc, unsigned int row);
+/// \endcode
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL >> 3
+/// row_index := row & 0xffff
+/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes / 4) - 1
+/// IF i + row_chunk / 4 >= tsrc.colsb / 4
+/// dst.dword[i] := 0
+/// ELSE
+/// dst.word[2*i+1] := 0
+/// dst.fp16[2*i+0] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
+/// FI
+/// ENDFOR
+/// dst[MAX_VL-1:VL] := 0
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCVTROWPS2PHL instruction.
+///
+/// \param tsrc
+/// The source tile. Max size is 1024 Bytes.
+/// \param row
+/// The the row of the source tile.
+#define _tile_cvtrowps2phl(tsrc, row) __builtin_ia32_tcvtrowps2phl(tsrc, row)
+
+/// Move one row of a tile data to a v16f32 data.
+/// The row of the tile is selected by a 32b GPR.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m512 _tile_movrow(__tile a, unsigned b);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
+///
+/// \param a
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// The 2nd source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v16f32 data. Size is 64 Bytes.
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL>>3
+/// row_index := b&0xffff
+/// row_chunk := ((b>>16)&0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes-1)
+/// IF (row_chunk + i >= a.colsb)
+/// dst.byte[i] := 0
+/// ELSE
+/// dst.byte[i] := a.row[row_index].byte[row_chunk+i]
+/// ENDFOR
+/// \endcode
+#define _tile_movrow(a, b) __builtin_ia32_tilemovrow(a, b)
+
+/// This is internal intrinsic. C/C++ user should avoid calling it directly.
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowd2ps_internal(
+ unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
+ return __builtin_ia32_tcvtrowd2ps_internal(m, n, src, u);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
+_tile_cvtrowps2pbf16h_internal(unsigned short m, unsigned short n,
+ _tile1024i src, unsigned u) {
+ return __builtin_ia32_tcvtrowps2pbf16h_internal(m, n, src, u);
+}
+
+static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
+_tile_cvtrowps2pbf16l_internal(unsigned short m, unsigned short n,
+ _tile1024i src, unsigned u) {
+ return __builtin_ia32_tcvtrowps2pbf16l_internal(m, n, src, u);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phh_internal(
+ unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
+ return __builtin_ia32_tcvtrowps2phh_internal(m, n, src, u);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phl_internal(
+ unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
+ return __builtin_ia32_tcvtrowps2phl_internal(m, n, src, u);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_AVX512 _tile_movrow_internal(
+ unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
+ return (__m512i)__builtin_ia32_tilemovrow_internal(m, n, src, u);
+}
+
+/// Move a row from a tile (src0) to a v16f32 dst, converting the int32 source
+/// elements to fp32. No SIMD exceptions are generated. Rounding is done as if
+/// MXCSR.RC=RNE. Embedded rounding is not supported.
+/// The row and chunk elements of tile is fetched from 32bit src1.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCVTROWD2PS </c> instruction.
+///
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 1st source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v16f32 data. Size is 64 Bytes.
+__DEFAULT_FN_ATTRS_AVX512
+static __m512 __tile_cvtrowd2ps(__tile1024i src0, unsigned src1) {
+ return _tile_cvtrowd2ps_internal(src0.row, src0.col, src0.tile, src1);
+}
+
+/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source
+/// elements to bf16 at high 16-bits of each dword.
+/// The row and chunk elements of tile is fetched from 32bit src1.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCVTROWPS2PBF16H </c> instruction.
+///
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 1st source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v32bf16 data. Size is 64 Bytes.
+__DEFAULT_FN_ATTRS_AVX512
+static __m512bh __tile_cvtrowps2pbf16h(__tile1024i src0, unsigned src1) {
+ return _tile_cvtrowps2pbf16h_internal(src0.row, src0.col, src0.tile, src1);
+}
+
+/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source
+/// elements to bf16 at low 16-bits of each dword.
+/// The row and chunk elements of tile is fetched from 32bit src1.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCVTROWPS2PBF16L </c> instruction.
+///
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 1st source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v32bf16 data. Size is 64 Bytes.
+__DEFAULT_FN_ATTRS_AVX512
+static __m512bh __tile_cvtrowps2pbf16l(__tile1024i src0, unsigned src1) {
+ return _tile_cvtrowps2pbf16l_internal(src0.row, src0.col, src0.tile, src1);
+}
+
+/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source
+/// elements to fp16 at high 16-bits of each dword.
+/// The row and chunk elements of tile is fetched from 32bit src1.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCVTROWPS2PHH </c> instruction.
+///
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 1st source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v32fp16 data. Size is 64 Bytes.
+__DEFAULT_FN_ATTRS_AVX512
+static __m512h __tile_cvtrowps2phh(__tile1024i src0, unsigned src1) {
+ return _tile_cvtrowps2phh_internal(src0.row, src0.col, src0.tile, src1);
+}
+
+/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source
+/// elements to fp16 at low 16-bits of each dword.
+/// The row and chunk elements of tile is fetched from 32bit src1.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCVTROWPS2PHL </c> instruction.
+///
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 1st source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v32fp16 data. Size is 64 Bytes.
+__DEFAULT_FN_ATTRS_AVX512
+static __m512h __tile_cvtrowps2phl(__tile1024i src0, unsigned src1) {
+ return _tile_cvtrowps2phl_internal(src0.row, src0.col, src0.tile, src1);
+}
+
+/// Move one row of a tile data to a v16f32 data.
+/// The row of the tile is selected by a 32b GPR.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
+///
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 1st source r32. Size is 4 Bytes.
+/// \returns
+/// The destination v16i32 data. Size is 64 Bytes.
+__DEFAULT_FN_ATTRS_AVX512
+static __m512i __tile_movrow(__tile1024i src0, unsigned src1) {
+ return (__m512i)_tile_movrow_internal(src0.row, src0.col, src0.tile, src1);
+}
+
+#endif // __x86_64__
+#endif // __AMX_AVX512INTRIN_H
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 3fbabffa98df20..84e56238fcf2dc 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -638,6 +638,10 @@ _storebe_i64(void * __P, long long __D) {
#include <amxcomplexintrin.h>
#endif
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_AVX512__)
+#include <amxavx512intrin.h>
+#endif
+
#if !defined(__SCE__) || __has_feature(modules) || \
defined(__AVX512VP2INTERSECT__)
#include <avx512vp2intersectintrin.h>
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index 6a4d78f0ca9084..fba901473e6e18 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -631,6 +631,12 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCa...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/114070
More information about the cfe-commits
mailing list