[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE, part 2 (PR #115660)
Phoebe Wang via cfe-commits
cfe-commits at lists.llvm.org
Sun Nov 10 06:48:56 PST 2024
https://github.com/phoebewang created https://github.com/llvm/llvm-project/pull/115660
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368
>From f2fc493149d75f0be13207bc1893a48c7fab84a3 Mon Sep 17 00:00:00 2001
From: "Wang, Phoebe" <phoebe.wang at intel.com>
Date: Sun, 10 Nov 2024 22:37:15 +0800
Subject: [PATCH] [X86][AMX] Support AMX-TRANSPOSE, part 2
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368
---
clang/include/clang/Basic/BuiltinsX86_64.def | 12 +
clang/lib/Headers/CMakeLists.txt | 3 +
clang/lib/Headers/amxbf16transposeintrin.h | 94 ++++++
clang/lib/Headers/amxcomplextransposeintrin.h | 301 ++++++++++++++++++
clang/lib/Headers/amxfp16intrin.h | 35 ++
clang/lib/Headers/amxfp16transposeintrin.h | 94 ++++++
clang/lib/Headers/amxintrin.h | 32 --
clang/lib/Headers/immintrin.h | 22 +-
clang/lib/Sema/SemaX86.cpp | 6 +
clang/test/CodeGen/X86/amx_transpose.c | 39 +++
clang/test/CodeGen/X86/amx_transpose_api.c | 50 ++-
clang/test/CodeGen/X86/amx_transpose_errors.c | 50 ++-
llvm/include/llvm/IR/IntrinsicsX86.td | 57 ++++
llvm/lib/Target/X86/X86ExpandPseudo.cpp | 28 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 58 ++--
llvm/lib/Target/X86/X86InstrAMX.td | 89 ++++++
llvm/lib/Target/X86/X86LowerAMXType.cpp | 24 +-
llvm/lib/Target/X86/X86RegisterInfo.cpp | 8 +-
.../CodeGen/X86/amx_transpose_intrinsics.ll | 77 ++++-
.../MC/Disassembler/X86/amx-transpose-att.txt | 48 +++
llvm/test/MC/X86/amx-transpose-att.s | 48 +++
llvm/test/MC/X86/amx-transpose-intel.s | 48 +++
22 files changed, 1154 insertions(+), 69 deletions(-)
create mode 100644 clang/lib/Headers/amxbf16transposeintrin.h
create mode 100644 clang/lib/Headers/amxcomplextransposeintrin.h
create mode 100644 clang/lib/Headers/amxfp16transposeintrin.h
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 9f7462b1e0d962..cc8637ed9c50da 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -133,6 +133,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z",
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-bf16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtfp16_internal, "V256iUsUsV256i", "n", "amx-complex,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
@@ -164,6 +170,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1, "vIUcvC*z", "n","amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n","amx-transpose")
TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtfp16, "vIUcIUc", "n", "amx-complex,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps, "V16fIUcUi", "n", "amx-avx512,avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h, "V32yIUcUi", "n", "amx-avx512,avx10.2-512")
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 76366ca1f108e9..19013d37f46ef7 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -147,8 +147,11 @@ set(x86_files
adxintrin.h
ammintrin.h
amxavx512intrin.h
+ amxbf16transposeintrin.h
amxcomplexintrin.h
+ amxcomplextransposeintrin.h
amxfp16intrin.h
+ amxfp16transposeintrin.h
amxfp8intrin.h
amxintrin.h
amxtransposeintrin.h
diff --git a/clang/lib/Headers/amxbf16transposeintrin.h b/clang/lib/Headers/amxbf16transposeintrin.h
new file mode 100644
index 00000000000000..7d31384e317988
--- /dev/null
+++ b/clang/lib/Headers/amxbf16transposeintrin.h
@@ -0,0 +1,94 @@
+/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <amxbf16transposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_BF16TRANSPOSEINTRIN_H
+#define __AMX_BF16TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("amx-bf16,amx-transpose")))
+
+/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
+/// tiles \a a and \a b, accumulating the intermediate single-precision
+/// (32-bit) floating-point elements with elements in \a dst, and store the
+/// 32-bit result back to tile \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+/// tmp := dst.row[m]
+/// FOR k := 0 TO (a.colsb / 4) - 1
+/// FOR n := 0 TO (dst.colsb / 4) - 1
+/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) *
+/// FP32(b.row[k].bf16[2*n+0])
+/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) *
+/// FP32(b.row[k].bf16[2*n+1])
+/// ENDFOR
+/// ENDFOR
+/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTDPBF16PS instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps(dst, a, b)
+
+/// This is internal intrinsic. C/C++ user should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS
+_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+ _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
+/// tiles src0 and src1, accumulating the intermediate single-precision
+/// (32-bit) floating-point elements with elements in "dst", and store the
+/// 32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTDPBF16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
+ src0.tile, src1.tile);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __x86_64__ */
+#endif /* __AMX_BF16TRANSPOSEINTRIN_H */
diff --git a/clang/lib/Headers/amxcomplextransposeintrin.h b/clang/lib/Headers/amxcomplextransposeintrin.h
new file mode 100644
index 00000000000000..06fb53e4deadcd
--- /dev/null
+++ b/clang/lib/Headers/amxcomplextransposeintrin.h
@@ -0,0 +1,301 @@
+/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H
+#define __AMX_COMPLEXTRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("amx-complex,amx-transpose")))
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+/// accumulate the results into a packed single precision tile. Each dword
+/// element in input tiles \a a and \a b is interpreted as a complex number
+/// with FP16 real part and FP16 imaginary part.
+/// Calculates the imaginary part of the result. For each possible combination
+/// of (transposed column of \a a, column of \a b), it performs a set of
+/// multiplication and accumulations on all corresponding complex numbers
+/// (one from \a a and one from \a b). The imaginary part of the \a a element
+/// is multiplied with the real part of the corresponding \a b element, and
+/// the real part of the \a a element is multiplied with the imaginary part
+/// of the corresponding \a b elements. The two accumulated results are
+/// added, and then accumulated into the corresponding row and column of
+/// \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+/// tmp := dst.row[m]
+/// FOR k := 0 TO a.rows - 1
+/// FOR n := 0 TO (dst.colsb / 4) - 1
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
+/// ENDFOR
+/// ENDFOR
+/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmimfp16ps(dst, a, b) __builtin_ia32_ttcmmimfp16ps(dst, a, b)
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+/// accumulate the results into a packed single precision tile. Each dword
+/// element in input tiles \a a and \a b is interpreted as a complex number
+/// with FP16 real part and FP16 imaginary part.
+/// Calculates the real part of the result. For each possible combination
+/// of (rtransposed colum of \a a, column of \a b), it performs a set of
+/// multiplication and accumulations on all corresponding complex numbers
+/// (one from \a a and one from \a b). The real part of the \a a element is
+/// multiplied with the real part of the corresponding \a b element, and the
+/// negated imaginary part of the \a a element is multiplied with the
+/// imaginary part of the corresponding \a b elements. The two accumulated
+/// results are added, and then accumulated into the corresponding row and
+/// column of \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+/// tmp := dst.row[m]
+/// FOR k := 0 TO a.rows - 1
+/// FOR n := 0 TO (dst.colsb / 4) - 1
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
+/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
+/// ENDFOR
+/// ENDFOR
+/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmrlfp16ps(dst, a, b) __builtin_ia32_ttcmmrlfp16ps(dst, a, b)
+
+/// Perform matrix conjugate transpose and multiplication of two tiles
+/// containing complex elements and accumulate the results into a packed
+/// single precision tile. Each dword element in input tiles \a a and \a b
+/// is interpreted as a complex number with FP16 real part and FP16 imaginary
+/// part.
+/// Calculates the imaginary part of the result. For each possible combination
+/// of (transposed column of \a a, column of \a b), it performs a set of
+/// multiplication and accumulations on all corresponding complex numbers
+/// (one from \a a and one from \a b). The negated imaginary part of the \a a
+/// element is multiplied with the real part of the corresponding \a b
+/// element, and the real part of the \a a element is multiplied with the
+/// imaginary part of the corresponding \a b elements. The two accumulated
+/// results are added, and then accumulated into the corresponding row and
+/// column of \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_conjtcmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+/// tmp := dst.row[m]
+/// FOR k := 0 TO a.rows - 1
+/// FOR n := 0 TO (dst.colsb / 4) - 1
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
+/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
+/// ENDFOR
+/// ENDFOR
+/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCONJTCMMIMFP16PS instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_conjtcmmimfp16ps(dst, a, b) \
+ __builtin_ia32_tconjtcmmimfp16ps(dst, a, b)
+
+/// Perform conjugate transpose of an FP16-pair of complex elements from \a a
+/// and writes the result to \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_conjtfp16(__tile dst, __tile a);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR i := 0 TO dst.rows - 1
+/// FOR j := 0 TO (dst.colsb / 4) - 1
+/// tmp.fp16[2*j+0] := a.row[j].fp16[2*i+0]
+/// tmp.fp16[2*j+1] := -a.row[j].fp16[2*i+1]
+/// ENDFOR
+/// write_row_and_zero(dst, i, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCONJTFP16 instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// The source tile. Max size is 1024 Bytes.
+#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16(dst, a)
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmimfp16ps_internal(
+ unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
+ _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_ttcmmimfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmrlfp16ps_internal(
+ unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
+ _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_ttcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtcmmimfp16ps_internal(
+ unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
+ _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_tconjtcmmimfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtfp16_internal(
+ unsigned short m, unsigned short n, _tile1024i src) {
+ return __builtin_ia32_tconjtfp16_internal(m, n, src);
+}
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+/// accumulate the results into a packed single precision tile. Each dword
+/// element in input tiles src0 and src1 is interpreted as a complex number
+/// with FP16 real part and FP16 imaginary part.
+/// This function calculates the imaginary part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTCMMIMFP16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_tcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_tcmmimfp16ps_internal(src0.row, src1.col, src0.col,
+ dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+/// accumulate the results into a packed single precision tile. Each dword
+/// element in input tiles src0 and src1 is interpreted as a complex number
+/// with FP16 real part and FP16 imaginary part.
+/// This function calculates the real part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTCMMRLFP16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_tcmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_tcmmrlfp16ps_internal(src0.row, src1.col, src0.col,
+ dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform matrix conjugate transpose and multiplication of two tiles
+/// containing complex elements and accumulate the results into a packed
+/// single precision tile. Each dword element in input tiles src0 and src1
+/// is interpreted as a complex number with FP16 real part and FP16 imaginary
+/// part.
+/// This function calculates the imaginary part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCONJTCMMIMFP16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_conjtcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_conjtcmmimfp16ps_internal(src0.row, src1.col, src0.col,
+ dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform conjugate transpose of an FP16-pair of complex elements from src and
+/// writes the result to dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCONJTFP16 </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src
+/// The source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_conjtfp16(__tile1024i *dst, __tile1024i src) {
+ dst->tile = _tile_conjtfp16_internal(src.row, src.col, src.tile);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif // __x86_64__
+#endif // __AMX_COMPLEXTRANSPOSEINTRIN_H
diff --git a/clang/lib/Headers/amxfp16intrin.h b/clang/lib/Headers/amxfp16intrin.h
index ed798245d41efb..bb4bc31fdafd50 100644
--- a/clang/lib/Headers/amxfp16intrin.h
+++ b/clang/lib/Headers/amxfp16intrin.h
@@ -15,6 +15,10 @@
#define __AMX_FP16INTRIN_H
#ifdef __x86_64__
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
+
/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
/// and \a b, accumulating the intermediate single-precision (32-bit)
/// floating-point elements with elements in \a dst, and store the 32-bit
@@ -54,5 +58,36 @@
#define _tile_dpfp16ps(dst, a, b) \
__builtin_ia32_tdpfp16ps(dst, a, b)
+/// This is internal intrinsic. C/C++ user should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS
+_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+ _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
+/// src1, accumulating the intermediate single-precision (32-bit) floating-point
+/// elements with elements in "dst", and store the 32-bit result back to tile
+/// "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
+ src0.tile, src1.tile);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
#endif /* __x86_64__ */
#endif /* __AMX_FP16INTRIN_H */
diff --git a/clang/lib/Headers/amxfp16transposeintrin.h b/clang/lib/Headers/amxfp16transposeintrin.h
new file mode 100644
index 00000000000000..c07c5516301983
--- /dev/null
+++ b/clang/lib/Headers/amxfp16transposeintrin.h
@@ -0,0 +1,94 @@
+/*===----- amxfp16transposeintrin.h - AMX-FP16 and AMX-TRANSPOSE ------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <amxfp16transposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_FP16TRANSPOSEINTRIN_H
+#define __AMX_FP16TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("amx-fp16,amx-transpose")))
+
+/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in
+/// tiles \a a and \a b, accumulating the intermediate single-precision
+/// (32-bit) floating-point elements with elements in \a dst, and store the
+/// 32-bit result back to tile \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_tdpfp16ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+/// tmp := dst.row[m]
+/// FOR k := 0 TO (a.colsb / 4) - 1
+/// FOR n := 0 TO (dst.colsb / 4) - 1
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
+/// FP32(b.row[k].fp16[2*n+0])
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
+/// FP32(b.row[k].fp16[2*n+1])
+/// ENDFOR
+/// ENDFOR
+/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTDPFP16PS instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tdpfp16ps(dst, a, b) __builtin_ia32_ttdpfp16ps(dst, a, b)
+
+/// This is internal intrinsic. C/C++ user should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS
+_tile_tdpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+ _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_ttdpfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in
+/// tiles src0 and src1, accumulating the intermediate single-precision
+/// (32-bit) floating-point elements with elements in "dst", and store the
+/// 32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTDPFP16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static __inline__ void __tile_tdpfp16ps(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_tdpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
+ src0.tile, src1.tile);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __x86_64__ */
+#endif /* __AMX_FP16TRANSPOSEINTRIN_H */
diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h
index f07a5689011853..b0140615677f27 100644
--- a/clang/lib/Headers/amxintrin.h
+++ b/clang/lib/Headers/amxintrin.h
@@ -22,8 +22,6 @@
__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
#define __DEFAULT_FN_ATTRS_BF16 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
-#define __DEFAULT_FN_ATTRS_FP16 \
- __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
/// Load tile configuration from a 64-byte memory location specified by
/// "mem_addr". The tile configuration includes the tile type palette, the
@@ -294,13 +292,6 @@ _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
}
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
-_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
- _tile1024i dst, _tile1024i src1, _tile1024i src2) {
- return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
-}
-
/// This struct pack the shape and tile data together for user. We suggest
/// initializing the struct as early as possible, because compiler depends
/// on the shape information to do configure. The constant value is preferred
@@ -495,32 +486,9 @@ static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
src0.tile, src1.tile);
}
-/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
-/// src1, accumulating the intermediate single-precision (32-bit) floating-point
-/// elements with elements in "dst", and store the 32-bit result back to tile
-/// "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
-///
-/// \param dst
-/// The destination tile. Max size is 1024 Bytes.
-/// \param src0
-/// The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-/// The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_FP16
-static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
- __tile1024i src1) {
- dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
- src0.tile, src1.tile);
-}
-
#undef __DEFAULT_FN_ATTRS_TILE
#undef __DEFAULT_FN_ATTRS_INT8
#undef __DEFAULT_FN_ATTRS_BF16
-#undef __DEFAULT_FN_ATTRS_FP16
#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index bc240e28d59142..a6945406dc6610 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -630,9 +630,6 @@ _storebe_i64(void * __P, long long __D) {
#if !defined(__SCE__) || __has_feature(modules) || defined(__INVPCID__)
#include <invpcidintrin.h>
#endif
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP16__)
-#include <amxfp16intrin.h>
-#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__KL__) || \
defined(__WIDEKL__)
@@ -644,6 +641,10 @@ _storebe_i64(void * __P, long long __D) {
#include <amxintrin.h>
#endif
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP16__)
+#include <amxfp16intrin.h>
+#endif
+
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_COMPLEX__)
#include <amxcomplexintrin.h>
#endif
@@ -660,6 +661,21 @@ _storebe_i64(void * __P, long long __D) {
#include <amxavx512intrin.h>
#endif
+#if !defined(__SCE__) || __has_feature(modules) || \
+ (defined(__AMX_BF16__) && defined(__AMX_TRANSPOSE__))
+#include <amxbf16transposeintrin.h>
+#endif
+
+#if !defined(__SCE__) || __has_feature(modules) || \
+ (defined(__AMX_FP16__) && defined(__AMX_TRANSPOSE__))
+#include <amxfp16transposeintrin.h>
+#endif
+
+#if !defined(__SCE__) || __has_feature(modules) || \
+ (defined(__AMX_COMPLEX__) && defined(__AMX_TRANSPOSE__))
+#include <amxcomplextransposeintrin.h>
+#endif
+
#if !defined(__SCE__) || __has_feature(modules) || \
defined(__AVX512VP2INTERSECT__)
#include <avx512vp2intersectintrin.h>
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index 1155a5edc73c34..4290f48d497290 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -654,8 +654,14 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_tdpbhf8ps:
case X86::BI__builtin_ia32_tdphbf8ps:
case X86::BI__builtin_ia32_tdphf8ps:
+ case X86::BI__builtin_ia32_ttdpbf16ps:
+ case X86::BI__builtin_ia32_ttdpfp16ps:
+ case X86::BI__builtin_ia32_ttcmmimfp16ps:
+ case X86::BI__builtin_ia32_ttcmmrlfp16ps:
+ case X86::BI__builtin_ia32_tconjtcmmimfp16ps:
return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2});
case X86::BI__builtin_ia32_ttransposed:
+ case X86::BI__builtin_ia32_tconjtfp16:
return CheckBuiltinTileArgumentsRange(TheCall, {0, 1});
}
}
diff --git a/clang/test/CodeGen/X86/amx_transpose.c b/clang/test/CodeGen/X86/amx_transpose.c
index deefc592c7ae66..7e88fd80592d62 100644
--- a/clang/test/CodeGen/X86/amx_transpose.c
+++ b/clang/test/CodeGen/X86/amx_transpose.c
@@ -1,4 +1,5 @@
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-transpose \
+// RUN: -target-feature +amx-bf16 -target-feature +amx-fp16 -target-feature +amx-complex \
// RUN: -target-feature +avx512f -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression| FileCheck %s
#include <immintrin.h>
@@ -34,3 +35,41 @@ void test_tile_transposed(void)
// CHECK: call void @llvm.x86.ttransposed(i8 1, i8 2)
_tile_transposed(1, 2);
}
+
+void test_tile_tdpbf16ps(void)
+{
+ // CHECK-LABEL: @test_tile_tdpbf16ps
+ // CHECK: call void @llvm.x86.ttdpbf16ps(i8 1, i8 2, i8 3)
+ _tile_tdpbf16ps(1, 2, 3);
+}
+
+void test_tile_tdpfp16ps(void)
+{
+ // CHECK-LABEL: @test_tile_tdpfp16ps
+ // CHECK: call void @llvm.x86.ttdpfp16ps(i8 4, i8 5, i8 6)
+ _tile_tdpfp16ps(4, 5, 6);
+}
+
+void test_tile_tcmmimfp16ps(void) {
+ // CHECK-LABEL: @test_tile_tcmmimfp16ps
+ // CHECK: call void @llvm.x86.ttcmmimfp16ps(i8 1, i8 2, i8 3)
+ _tile_tcmmimfp16ps(1, 2, 3);
+}
+
+void test_tile_tcmmrlfp16ps(void) {
+ // CHECK-LABEL: @test_tile_tcmmrlfp16ps
+ // CHECK: call void @llvm.x86.ttcmmrlfp16ps(i8 1, i8 2, i8 3)
+ _tile_tcmmrlfp16ps(1, 2, 3);
+}
+
+void test_tile_conjtcmmimfp16ps(void) {
+ // CHECK-LABEL: @test_tile_conjtcmmimfp16ps
+ // CHECK: call void @llvm.x86.tconjtcmmimfp16ps(i8 1, i8 2, i8 3)
+ _tile_conjtcmmimfp16ps(1, 2, 3);
+}
+
+void test_tile_conjtfp16(void) {
+ // CHECK-LABEL: @test_tile_conjtfp16
+ // CHECK: call void @llvm.x86.tconjtfp16(i8 1, i8 2)
+ _tile_conjtfp16(1, 2);
+}
diff --git a/clang/test/CodeGen/X86/amx_transpose_api.c b/clang/test/CodeGen/X86/amx_transpose_api.c
index 10310c2332b7a6..dc3ef5104252ca 100644
--- a/clang/test/CodeGen/X86/amx_transpose_api.c
+++ b/clang/test/CodeGen/X86/amx_transpose_api.c
@@ -1,5 +1,5 @@
// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx512f \
-// RUN: -target-feature +amx-transpose \
+// RUN: -target-feature +amx-transpose -target-feature +amx-bf16 -target-feature +amx-fp16 -target-feature +amx-complex \
// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK
#include <immintrin.h>
@@ -64,3 +64,51 @@ void test_tile_transposed(__tile1024i dst, __tile1024i src) {
//CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
__tile_transposed(&dst, src);
}
+
+void test_tile_tdpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
+ //CHECK-LABEL: @test_tile_tdpbf16ps
+ //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
+ //CHECK-DAG: call x86_amx @llvm.x86.ttdpbf16ps.internal
+ //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ __tile_tdpbf16ps(&c, a, b);
+}
+
+void test_tile_tdpfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
+ //CHECK-LABEL: @test_tile_tdpfp16ps
+ //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
+ //CHECK-DAG: call x86_amx @llvm.x86.ttdpfp16ps.internal
+ //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ __tile_tdpfp16ps(&c, a, b);
+}
+
+void test_tile_tcmmimfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
+ //CHECK-LABEL: @test_tile_tcmmimfp16ps
+ //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
+ //CHECK-DAG: call x86_amx @llvm.x86.ttcmmimfp16ps.internal
+ //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ __tile_tcmmimfp16ps(&c, a, b);
+}
+
+void test_tile_tcmmrlfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
+ //CHECK-LABEL: @test_tile_tcmmrlfp16ps
+ //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
+ //CHECK-DAG: call x86_amx @llvm.x86.ttcmmrlfp16ps.internal
+ //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ __tile_tcmmrlfp16ps(&c, a, b);
+}
+
+void test_tile_conjtcmmimfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) {
+ //CHECK-LABEL: @test_tile_conjtcmmimfp16ps
+ //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
+ //CHECK-DAG: call x86_amx @llvm.x86.tconjtcmmimfp16ps.internal
+ //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ __tile_conjtcmmimfp16ps(&c, a, b);
+}
+
+void test_tile_conjtfp16(__tile1024i dst, __tile1024i src) {
+ //CHECK-LABEL: @test_tile_conjtfp16
+ //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
+ //CHECK-DAG: call x86_amx @llvm.x86.tconjtfp16.internal
+ //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+ __tile_conjtfp16(&dst, src);
+}
diff --git a/clang/test/CodeGen/X86/amx_transpose_errors.c b/clang/test/CodeGen/X86/amx_transpose_errors.c
index 80084c42a240dd..80368c580c793a 100644
--- a/clang/test/CodeGen/X86/amx_transpose_errors.c
+++ b/clang/test/CodeGen/X86/amx_transpose_errors.c
@@ -1,9 +1,7 @@
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
// RUN: -target-feature +amx-int8 -target-feature +amx-bf16 -target-feature +amx-transpose \
-// RUN: -target-feature +avx512f -target-feature +amx-element-evex -verify
+// RUN: -target-feature +avx512f -target-feature +amx-fp16 -target-feature +amx-complex -verify
-#include <immintrin.h>
-#include <stddef.h>
#include <immintrin.h>
#include <stddef.h>
@@ -24,8 +22,54 @@ void test_tile_2rpntlvwz1t1(const void *A, size_t B) {
_tile_2rpntlvwz1t1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
}
+void test_tile_tdpbf16ps()
+{
+ _tile_tdpbf16ps(8, 2, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ _tile_tdpbf16ps(1, 8, 3); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ _tile_tdpbf16ps(1, 2, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ _tile_tdpbf16ps(1, 1, 3); // expected-error {{tile arguments must refer to different tiles}}
+ _tile_tdpbf16ps(1, 2, 1); // expected-error {{tile arguments must refer to different tiles}}
+ _tile_tdpbf16ps(1, 2, 2); // expected-error {{tile arguments must refer to different tiles}}
+}
+
+void test_tile_tdpfp16ps()
+{
+ _tile_tdpfp16ps(8, 5, 6); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ _tile_tdpfp16ps(1, 8, 6); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ _tile_tdpfp16ps(1, 5, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ _tile_tdpfp16ps(1, 1, 3); // expected-error {{tile arguments must refer to different tiles}}
+ _tile_tdpfp16ps(1, 2, 1); // expected-error {{tile arguments must refer to different tiles}}
+ _tile_tdpfp16ps(1, 2, 2); // expected-error {{tile arguments must refer to different tiles}}
+}
+
void test_tile_transposed()
{
_tile_transposed(8, 2); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
_tile_transposed(1, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
}
+
+void test_tile_tcmmimfp16ps() {
+ _tile_tcmmimfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
+ _tile_tcmmimfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
+ _tile_tcmmimfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}}
+ _tile_tcmmimfp16ps(1, 1, 3); // expected-error {{tile arguments must refer to different tiles}}
+}
+
+void test_tile_tcmmrlfp16ps() {
+ _tile_tcmmrlfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
+ _tile_tcmmrlfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
+ _tile_tcmmrlfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}}
+ _tile_tcmmrlfp16ps(1, 1, 3); // expected-error {{tile arguments must refer to different tiles}}
+}
+
+void test_tile_conjtcmmimfp16ps() {
+ _tile_conjtcmmimfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
+ _tile_conjtcmmimfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
+ _tile_conjtcmmimfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}}
+ _tile_conjtcmmimfp16ps(1, 2, 1); // expected-error {{tile arguments must refer to different tiles}}
+}
+
+void test_tile_conjtfp16() {
+ _tile_conjtfp16(16, 2); // expected-error {{argument value 16 is outside the valid range [0, 7]}}
+ _tile_conjtfp16(1, 26); // expected-error {{argument value 26 is outside the valid range [0, 7]}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 3003f9887e239c..21d4ece9d92f63 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5951,6 +5951,29 @@ let TargetPrefix = "x86" in {
def int_x86_ttransposed : ClangBuiltin<"__builtin_ia32_ttransposed">,
Intrinsic<[], [llvm_i8_ty, llvm_i8_ty],
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
+ def int_x86_ttdpbf16ps : ClangBuiltin<"__builtin_ia32_ttdpbf16ps">,
+ Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
+ [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
+ ImmArg<ArgIndex<2>>]>;
+ def int_x86_ttdpfp16ps : ClangBuiltin<"__builtin_ia32_ttdpfp16ps">,
+ Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
+ [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
+ ImmArg<ArgIndex<2>>]>;
+ def int_x86_ttcmmimfp16ps : ClangBuiltin<"__builtin_ia32_ttcmmimfp16ps">,
+ Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
+ [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
+ ImmArg<ArgIndex<2>>]>;
+ def int_x86_ttcmmrlfp16ps : ClangBuiltin<"__builtin_ia32_ttcmmrlfp16ps">,
+ Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
+ [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
+ ImmArg<ArgIndex<2>>]>;
+ def int_x86_tconjtcmmimfp16ps : ClangBuiltin<"__builtin_ia32_tconjtcmmimfp16ps">,
+ Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
+ [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
+ ImmArg<ArgIndex<2>>]>;
+ def int_x86_tconjtfp16 : ClangBuiltin<"__builtin_ia32_tconjtfp16">,
+ Intrinsic<[], [llvm_i8_ty, llvm_i8_ty],
+ [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
// AMX-AVX512
def int_x86_tcvtrowd2ps : ClangBuiltin<"__builtin_ia32_tcvtrowd2ps">,
@@ -6070,6 +6093,40 @@ let TargetPrefix = "x86" in {
ClangBuiltin<"__builtin_ia32_ttransposed_internal">,
Intrinsic<[llvm_x86amx_ty],
[llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty], []>;
+ def int_x86_ttdpbf16ps_internal :
+ ClangBuiltin<"__builtin_ia32_ttdpbf16ps_internal">,
+ Intrinsic<[llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
+ llvm_x86amx_ty, llvm_x86amx_ty,
+ llvm_x86amx_ty], []>;
+ def int_x86_ttdpfp16ps_internal :
+ ClangBuiltin<"__builtin_ia32_ttdpfp16ps_internal">,
+ Intrinsic<[llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
+ llvm_x86amx_ty, llvm_x86amx_ty,
+ llvm_x86amx_ty], []>;
+ def int_x86_ttcmmimfp16ps_internal :
+ ClangBuiltin<"__builtin_ia32_ttcmmimfp16ps_internal">,
+ Intrinsic<[llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
+ llvm_x86amx_ty, llvm_x86amx_ty,
+ llvm_x86amx_ty], []>;
+ def int_x86_ttcmmrlfp16ps_internal :
+ ClangBuiltin<"__builtin_ia32_ttcmmrlfp16ps_internal">,
+ Intrinsic<[llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
+ llvm_x86amx_ty, llvm_x86amx_ty,
+ llvm_x86amx_ty], []>;
+ def int_x86_tconjtcmmimfp16ps_internal :
+ ClangBuiltin<"__builtin_ia32_tconjtcmmimfp16ps_internal">,
+ Intrinsic<[llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
+ llvm_x86amx_ty, llvm_x86amx_ty,
+ llvm_x86amx_ty], []>;
+ def int_x86_tconjtfp16_internal :
+ ClangBuiltin<"__builtin_ia32_tconjtfp16_internal">,
+ Intrinsic<[llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty], []>;
def int_x86_tcvtrowd2ps_internal :
ClangBuiltin<"__builtin_ia32_tcvtrowd2ps_internal">,
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index a6096e5032e89c..f68291f28afeb4 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -742,10 +742,12 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MI.setDesc(TII->get(Opc));
return true;
}
- case X86::PTTRANSPOSEDV: {
+ case X86::PTTRANSPOSEDV:
+ case X86::PTCONJTFP16V: {
for (int i = 2; i > 0; --i)
MI.removeOperand(i);
- MI.setDesc(TII->get(X86::TTRANSPOSED));
+ MI.setDesc(TII->get(Opcode == X86::PTTRANSPOSEDV ? X86::TTRANSPOSED
+ : X86::TCONJTFP16));
return true;
}
case X86::PTCMMIMFP16PSV:
@@ -755,7 +757,12 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case X86::PTDPBUSDV:
case X86::PTDPBUUDV:
case X86::PTDPBF16PSV:
- case X86::PTDPFP16PSV: {
+ case X86::PTDPFP16PSV:
+ case X86::PTTDPBF16PSV:
+ case X86::PTTDPFP16PSV:
+ case X86::PTTCMMIMFP16PSV:
+ case X86::PTTCMMRLFP16PSV:
+ case X86::PTCONJTCMMIMFP16PSV: {
MI.untieRegOperand(4);
for (unsigned i = 3; i > 0; --i)
MI.removeOperand(i);
@@ -769,6 +776,21 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case X86::PTDPBUUDV: Opc = X86::TDPBUUD; break;
case X86::PTDPBF16PSV: Opc = X86::TDPBF16PS; break;
case X86::PTDPFP16PSV: Opc = X86::TDPFP16PS; break;
+ case X86::PTTDPBF16PSV:
+ Opc = X86::TTDPBF16PS;
+ break;
+ case X86::PTTDPFP16PSV:
+ Opc = X86::TTDPFP16PS;
+ break;
+ case X86::PTTCMMIMFP16PSV:
+ Opc = X86::TTCMMIMFP16PS;
+ break;
+ case X86::PTTCMMRLFP16PSV:
+ Opc = X86::TTCMMRLFP16PS;
+ break;
+ case X86::PTCONJTCMMIMFP16PSV:
+ Opc = X86::TCONJTCMMIMFP16PS;
+ break;
default:
llvm_unreachable("Unexpected Opcode");
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3888d207206ec8..a0faf4769dfc31 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37465,13 +37465,19 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::PTDPBUUD:
case X86::PTDPBF16PS:
case X86::PTDPFP16PS:
+ case X86::PTCMMIMFP16PS:
+ case X86::PTCMMRLFP16PS:
case X86::PTDPBF8PS:
case X86::PTDPBHF8PS:
case X86::PTDPHBF8PS:
- case X86::PTDPHF8PS: {
+ case X86::PTDPHF8PS:
+ case X86::PTTDPBF16PS:
+ case X86::PTTDPFP16PS:
+ case X86::PTTCMMIMFP16PS:
+ case X86::PTTCMMRLFP16PS:
+ case X86::PTCONJTCMMIMFP16PS: {
unsigned Opc;
switch (MI.getOpcode()) {
- // clang-format off
default: llvm_unreachable("illegal opcode!");
case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
@@ -37479,11 +37485,31 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
+ case X86::PTCMMIMFP16PS:
+ Opc = X86::TCMMIMFP16PS;
+ break;
+ case X86::PTCMMRLFP16PS:
+ Opc = X86::TCMMRLFP16PS;
+ break;
case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
- // clang-format on
+ case X86::PTTDPBF16PS:
+ Opc = X86::TTDPBF16PS;
+ break;
+ case X86::PTTDPFP16PS:
+ Opc = X86::TTDPFP16PS;
+ break;
+ case X86::PTTCMMIMFP16PS:
+ Opc = X86::TTCMMIMFP16PS;
+ break;
+ case X86::PTTCMMRLFP16PS:
+ Opc = X86::TTCMMRLFP16PS;
+ break;
+ case X86::PTCONJTCMMIMFP16PS:
+ Opc = X86::TCONJTCMMIMFP16PS;
+ break;
}
MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
@@ -37546,25 +37572,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
- case X86::PTCMMIMFP16PS:
- case X86::PTCMMRLFP16PS: {
- const MIMetadata MIMD(MI);
- unsigned Opc;
- switch (MI.getOpcode()) {
- // clang-format off
- default: llvm_unreachable("Unexpected instruction!");
- case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break;
- case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break;
- // clang-format on
- }
- MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
- MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
- MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
- MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
- MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
- MI.eraseFromParent(); // The pseudo is gone now.
- return BB;
- }
case X86::PT2RPNTLVWZ0:
case X86::PT2RPNTLVWZ0T1:
case X86::PT2RPNTLVWZ1:
@@ -37598,10 +37605,13 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
- case X86::PTTRANSPOSED: {
+ case X86::PTTRANSPOSED:
+ case X86::PTCONJTFP16: {
const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
+ : X86::TCONJTFP16;
- MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::TTRANSPOSED));
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index b954c977f8c6c9..428d1823355b49 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -370,6 +370,95 @@ let Predicates = [HasAMXTRANSPOSE, In64BitMode] in {
}
} // HasAMXTILE, HasAMXTRANSPOSE
+let Predicates = [HasAMXBF16, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in {
+ let Constraints = "$src1 = $dst" in
+ def TTDPBF16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "ttdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ []>, VEX, VVVV, T8,XS;
+ let Constraints = "$src4 = $dst" in
+ def PTTDPBF16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6),
+ [(set TILE: $dst,
+ (int_x86_ttdpbf16ps_internal GR16:$src1, GR16:$src2,
+ GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
+ let usesCustomInserter = 1 in
+ def PTTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
+ [(int_x86_ttdpbf16ps timm:$src1, timm:$src2, timm:$src3)]>;
+}
+
+let Predicates = [HasAMXFP16, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in {
+ let Constraints = "$src1 = $dst" in
+ def TTDPFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "ttdpfp16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ []>, VEX, VVVV, T8,XD;
+ let Constraints = "$src4 = $dst" in
+ def PTTDPFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6),
+ [(set TILE: $dst,
+ (int_x86_ttdpfp16ps_internal GR16:$src1, GR16:$src2,
+ GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
+ let usesCustomInserter = 1 in
+ def PTTDPFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
+ [(int_x86_ttdpfp16ps timm:$src1, timm:$src2, timm:$src3)]>;
+}
+
+let Predicates = [HasAMXCOMPLEX, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in {
+ let Constraints = "$src1 = $dst" in {
+ def TTCMMIMFP16PS : I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "ttcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}",
+ []>, VEX, VVVV, T8,XD;
+ def TTCMMRLFP16PS: I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "ttcmmrlfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}",
+ []>, VEX, VVVV, T8,XS;
+ def TCONJTCMMIMFP16PS : I<0x6b, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "tconjtcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}",
+ []>, VEX, VVVV, WIG, T8,PS;
+ }
+ def TCONJTFP16 : I<0x6b, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src),
+ "tconjtfp16\t{$src, $dst|$dst, $src}", []>, VEX, T8,PD;
+
+ let Constraints = "$src4 = $dst" in {
+ def PTTCMMIMFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6),
+ [(set TILE: $dst,
+ (int_x86_ttcmmimfp16ps_internal GR16:$src1, GR16:$src2,
+ GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
+ def PTTCMMRLFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6),
+ [(set TILE: $dst,
+ (int_x86_ttcmmrlfp16ps_internal GR16:$src1, GR16:$src2,
+ GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
+ def PTCONJTCMMIMFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6),
+ [(set TILE: $dst,
+ (int_x86_tconjtcmmimfp16ps_internal GR16:$src1, GR16:$src2,
+ GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>;
+ }
+ def PTCONJTFP16V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, TILE:$src3),
+ [(set TILE: $dst, (int_x86_tconjtfp16_internal GR16:$src1, GR16:$src2, TILE:$src3))]>;
+
+ let usesCustomInserter = 1 in {
+ def PTTCMMIMFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
+ [(int_x86_ttcmmimfp16ps timm:$src1, timm:$src2, timm:$src3)]>;
+ def PTTCMMRLFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
+ [(int_x86_ttcmmrlfp16ps timm:$src1, timm:$src2, timm:$src3)]>;
+ def PTCONJTCMMIMFP16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
+ [(int_x86_tconjtcmmimfp16ps timm:$src1, timm:$src2, timm:$src3)]>;
+ def PTCONJTFP16 : PseudoI<(outs), (ins u8imm:$dst, u8imm:$src),
+ [(int_x86_tconjtfp16 timm:$dst, timm:$src)]>;
+ }
+}
+
multiclass m_tcvtrowd2ps {
let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
let SchedRW = [WriteSystem] in {
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 08c065c39ee1e3..37a27bb7af3d9c 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -258,7 +258,8 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II,
}
break;
}
- case Intrinsic::x86_ttransposed_internal: {
+ case Intrinsic::x86_ttransposed_internal:
+ case Intrinsic::x86_tconjtfp16_internal: {
assert((OpNo == 2) && "Illegal Operand Number.");
Row = getRowFromCol(II, II->getArgOperand(1), 4);
Col = getColFromRow(II, II->getArgOperand(0), 4);
@@ -275,6 +276,27 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II,
Col = II->getArgOperand(1);
break;
}
+ case Intrinsic::x86_ttdpbf16ps_internal:
+ case Intrinsic::x86_ttdpfp16ps_internal:
+ case Intrinsic::x86_ttcmmimfp16ps_internal:
+ case Intrinsic::x86_ttcmmrlfp16ps_internal:
+ case Intrinsic::x86_tconjtcmmimfp16ps_internal: {
+ switch (OpNo) {
+ case 3:
+ Row = II->getArgOperand(0);
+ Col = II->getArgOperand(1);
+ break;
+ case 4:
+ Row = getRowFromCol(II, II->getArgOperand(2), 4);
+ Col = getColFromRow(II, II->getArgOperand(0), 4);
+ break;
+ case 5:
+ Row = getRowFromCol(II, II->getArgOperand(2), 4);
+ Col = II->getArgOperand(1);
+ break;
+ }
+ break;
+ }
}
return std::make_pair(Row, Col);
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 1b2192e3891fc5..d931ffb6c994b0 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -1076,7 +1076,13 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
case X86::PTDPFP16PSV:
case X86::PTCMMIMFP16PSV:
case X86::PTCMMRLFP16PSV:
- case X86::PTTRANSPOSEDV: {
+ case X86::PTTRANSPOSEDV:
+ case X86::PTTDPBF16PSV:
+ case X86::PTTDPFP16PSV:
+ case X86::PTTCMMIMFP16PSV:
+ case X86::PTTCMMRLFP16PSV:
+ case X86::PTCONJTCMMIMFP16PSV:
+ case X86::PTCONJTFP16V: {
MachineOperand &MO1 = MI->getOperand(1);
MachineOperand &MO2 = MI->getOperand(2);
ShapeT Shape(&MO1, &MO2, MRI);
diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll
index 2025ee94a97405..cc4360317db7db 100644
--- a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll
+++ b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-tile,+amx-bf16,+amx-int8,+amx-transpose | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose | FileCheck %s
define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 {
; CHECK-LABEL: test_amx:
@@ -9,12 +9,24 @@ define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x floa
; CHECK-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0
; CHECK-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2
; CHECK-NEXT: ttransposed %tmm3, %tmm1
+; CHECK-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1
+; CHECK-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4
+; CHECK-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1
+; CHECK-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1
+; CHECK-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1
+; CHECK-NEXT: tconjtfp16 %tmm2, %tmm1
; CHECK-NEXT: retq
call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride)
call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride)
call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride)
call void @llvm.x86.t2rpntlvwz1t1(i8 2, i8* %addr1, i64 %stride)
call void @llvm.x86.ttransposed(i8 1, i8 3)
+ call void @llvm.x86.ttdpbf16ps(i8 1, i8 2, i8 3)
+ call void @llvm.x86.ttdpfp16ps(i8 4, i8 5, i8 6)
+ call void @llvm.x86.ttcmmimfp16ps(i8 1, i8 2, i8 3)
+ call void @llvm.x86.ttcmmrlfp16ps(i8 1, i8 2, i8 3)
+ call void @llvm.x86.tconjtcmmimfp16ps(i8 1, i8 2, i8 3)
+ call void @llvm.x86.tconjtfp16(i8 1, i8 2)
ret void
}
@@ -23,6 +35,63 @@ declare void @llvm.x86.t2rpntlvwz0t1(i8 %tile1, i8* %addr1, i64 %stride)
declare void @llvm.x86.t2rpntlvwz1(i8 %tile1, i8* %addr1, i64 %stride)
declare void @llvm.x86.t2rpntlvwz1t1(i8 %tile1, i8* %addr1, i64 %stride)
declare void @llvm.x86.ttransposed(i8 %tile0, i8 %tile1)
+declare void @llvm.x86.ttdpbf16ps(i8 %tile0, i8 %tile1, i8 %tile2)
+declare void @llvm.x86.ttdpfp16ps(i8 %tile0, i8 %tile1, i8 %tile2)
+declare void @llvm.x86.ttcmmimfp16ps(i8 %A, i8 %B, i8 %C)
+declare void @llvm.x86.ttcmmrlfp16ps(i8 %A, i8 %B, i8 %C)
+declare void @llvm.x86.tconjtcmmimfp16ps(i8 %A, i8 %B, i8 %C)
+declare void @llvm.x86.tconjtfp16(i8 %A, i8 %B)
+
+define void @test_amx2(i8* %pointer, i8* %base, i64 %stride) #0 {
+; CHECK-LABEL: test_amx2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: subq $2928, %rsp # imm = 0xB70
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $8, %ax
+; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0
+; CHECK-NEXT: tilezero %tmm1
+; CHECK-NEXT: tilezero %tmm2
+; CHECK-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2
+; CHECK-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2
+; CHECK-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2
+; CHECK-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2
+; CHECK-NEXT: movabsq $64, %rbp
+; CHECK-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill
+; CHECK-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload
+; CHECK-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3
+; CHECK-NEXT: tconjtfp16 %tmm3, %tmm0
+; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx)
+; CHECK-NEXT: addq $2928, %rsp # imm = 0xB70
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: tilerelease
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+
+ %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
+ %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
+ %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
+ %c1 = call x86_amx @llvm.x86.ttdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
+ %c2 = call x86_amx @llvm.x86.ttdpfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b)
+ %c3 = call x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c2, x86_amx %a, x86_amx %b)
+ %c4 = call x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c3, x86_amx %a, x86_amx %b)
+ %c5 = call x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c4, x86_amx %a, x86_amx %b)
+ %c6 = call x86_amx @llvm.x86.tconjtfp16.internal(i16 8, i16 8, x86_amx %c5)
+
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c4)
+ ret void
+}
define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 {
; CHECK-LABEL: test_amx3:
@@ -146,5 +215,11 @@ declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16, i16, i16, i8*
declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16, i16, i16, i8*, i64)
declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16, i16, i16, i8*, i64)
declare x86_amx @llvm.x86.ttransposed.internal(i16, i16, x86_amx)
+declare x86_amx @llvm.x86.ttdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare x86_amx @llvm.x86.ttdpfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare x86_amx @llvm.x86.tconjtfp16.internal(i16, i16, x86_amx)
attributes #0 = { nounwind }
diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt
index e4f1689639ef9a..8c6f1be80ba2dc 100644
--- a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt
+++ b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt
@@ -56,3 +56,51 @@
# ATT: ttransposed %tmm2, %tmm3
# INTEL: ttransposed tmm3, tmm2
0xc4,0xe2,0x7a,0x5f,0xda
+
+# ATT: ttdpbf16ps %tmm7, %tmm6, %tmm5
+# INTEL: ttdpbf16ps tmm5, tmm6, tmm7
+0xc4,0xe2,0x42,0x6c,0xee
+
+# ATT: ttdpbf16ps %tmm1, %tmm2, %tmm3
+# INTEL: ttdpbf16ps tmm3, tmm2, tmm1
+0xc4,0xe2,0x72,0x6c,0xda
+
+# ATT: ttdpfp16ps %tmm7, %tmm6, %tmm5
+# INTEL: ttdpfp16ps tmm5, tmm6, tmm7
+0xc4,0xe2,0x43,0x6c,0xee
+
+# ATT: ttdpfp16ps %tmm1, %tmm2, %tmm3
+# INTEL: ttdpfp16ps tmm3, tmm2, tmm1
+0xc4,0xe2,0x73,0x6c,0xda
+
+# ATT: ttcmmimfp16ps %tmm4, %tmm5, %tmm6
+# INTEL: ttcmmimfp16ps tmm6, tmm5, tmm4
+0xc4,0xe2,0x5b,0x6b,0xf5
+
+# ATT: ttcmmimfp16ps %tmm1, %tmm2, %tmm3
+# INTEL: ttcmmimfp16ps tmm3, tmm2, tmm1
+0xc4,0xe2,0x73,0x6b,0xda
+
+# ATT: ttcmmrlfp16ps %tmm4, %tmm5, %tmm6
+# INTEL: ttcmmrlfp16ps tmm6, tmm5, tmm4
+0xc4,0xe2,0x5a,0x6b,0xf5
+
+# ATT: ttcmmrlfp16ps %tmm1, %tmm2, %tmm3
+# INTEL: ttcmmrlfp16ps tmm3, tmm2, tmm1
+0xc4,0xe2,0x72,0x6b,0xda
+
+# ATT: tconjtcmmimfp16ps %tmm4, %tmm5, %tmm6
+# INTEL: tconjtcmmimfp16ps tmm6, tmm5, tmm4
+0xc4,0xe2,0x58,0x6b,0xf5
+
+# ATT: tconjtcmmimfp16ps %tmm1, %tmm2, %tmm3
+# INTEL: tconjtcmmimfp16ps tmm3, tmm2, tmm1
+0xc4,0xe2,0x70,0x6b,0xda
+
+# ATT: tconjtfp16 %tmm5, %tmm6
+# INTEL: tconjtfp16 tmm6, tmm5
+0xc4,0xe2,0x79,0x6b,0xf5
+
+# ATT: tconjtfp16 %tmm2, %tmm3
+# INTEL: tconjtfp16 tmm3, tmm2
+0xc4,0xe2,0x79,0x6b,0xda
diff --git a/llvm/test/MC/X86/amx-transpose-att.s b/llvm/test/MC/X86/amx-transpose-att.s
index da3fa95ef6dd06..21bbf258ac6ef8 100644
--- a/llvm/test/MC/X86/amx-transpose-att.s
+++ b/llvm/test/MC/X86/amx-transpose-att.s
@@ -55,3 +55,51 @@
// CHECK: ttransposed %tmm2, %tmm3
// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xda]
ttransposed %tmm2, %tmm3
+
+// CHECK: ttdpbf16ps %tmm1, %tmm2, %tmm5
+// CHECK: encoding: [0xc4,0xe2,0x72,0x6c,0xea]
+ ttdpbf16ps %tmm1, %tmm2, %tmm5
+
+// CHECK: ttdpbf16ps %tmm1, %tmm2, %tmm3
+// CHECK: encoding: [0xc4,0xe2,0x72,0x6c,0xda]
+ ttdpbf16ps %tmm1, %tmm2, %tmm3
+
+// CHECK: ttdpfp16ps %tmm3, %tmm4, %tmm5
+// CHECK: encoding: [0xc4,0xe2,0x63,0x6c,0xec]
+ ttdpfp16ps %tmm3, %tmm4, %tmm5
+
+// CHECK: ttdpfp16ps %tmm1, %tmm2, %tmm3
+// CHECK: encoding: [0xc4,0xe2,0x73,0x6c,0xda]
+ ttdpfp16ps %tmm1, %tmm2, %tmm3
+
+// CHECK: ttcmmimfp16ps %tmm4, %tmm5, %tmm6
+// CHECK: encoding: [0xc4,0xe2,0x5b,0x6b,0xf5]
+ ttcmmimfp16ps %tmm4, %tmm5, %tmm6
+
+// CHECK: ttcmmimfp16ps %tmm1, %tmm2, %tmm3
+// CHECK: encoding: [0xc4,0xe2,0x73,0x6b,0xda]
+ ttcmmimfp16ps %tmm1, %tmm2, %tmm3
+
+// CHECK: ttcmmrlfp16ps %tmm4, %tmm5, %tmm6
+// CHECK: encoding: [0xc4,0xe2,0x5a,0x6b,0xf5]
+ ttcmmrlfp16ps %tmm4, %tmm5, %tmm6
+
+// CHECK: ttcmmrlfp16ps %tmm1, %tmm2, %tmm3
+// CHECK: encoding: [0xc4,0xe2,0x72,0x6b,0xda]
+ ttcmmrlfp16ps %tmm1, %tmm2, %tmm3
+
+// CHECK: tconjtcmmimfp16ps %tmm4, %tmm5, %tmm6
+// CHECK: encoding: [0xc4,0xe2,0x58,0x6b,0xf5]
+ tconjtcmmimfp16ps %tmm4, %tmm5, %tmm6
+
+// CHECK: tconjtcmmimfp16ps %tmm1, %tmm2, %tmm3
+// CHECK: encoding: [0xc4,0xe2,0x70,0x6b,0xda]
+ tconjtcmmimfp16ps %tmm1, %tmm2, %tmm3
+
+// CHECK: tconjtfp16 %tmm5, %tmm6
+// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xf5]
+ tconjtfp16 %tmm5, %tmm6
+
+// CHECK: tconjtfp16 %tmm2, %tmm3
+// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xda]
+ tconjtfp16 %tmm2, %tmm3
diff --git a/llvm/test/MC/X86/amx-transpose-intel.s b/llvm/test/MC/X86/amx-transpose-intel.s
index 3b8dfaed313d61..a772232ddbbf2e 100644
--- a/llvm/test/MC/X86/amx-transpose-intel.s
+++ b/llvm/test/MC/X86/amx-transpose-intel.s
@@ -55,3 +55,51 @@
// CHECK: ttransposed tmm3, tmm2
// CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xda]
ttransposed tmm3, tmm2
+
+// CHECK: ttdpbf16ps tmm5, tmm0, tmm4
+// CHECK: encoding: [0xc4,0xe2,0x5a,0x6c,0xe8]
+ ttdpbf16ps tmm5, tmm0, tmm4
+
+// CHECK: ttdpbf16ps tmm3, tmm2, tmm1
+// CHECK: encoding: [0xc4,0xe2,0x72,0x6c,0xda]
+ ttdpbf16ps tmm3, tmm2, tmm1
+
+// CHECK: ttdpfp16ps tmm1, tmm0, tmm4
+// CHECK: encoding: [0xc4,0xe2,0x5b,0x6c,0xc8]
+ ttdpfp16ps tmm1, tmm0, tmm4
+
+// CHECK: ttdpfp16ps tmm3, tmm2, tmm1
+// CHECK: encoding: [0xc4,0xe2,0x73,0x6c,0xda]
+ ttdpfp16ps tmm3, tmm2, tmm1
+
+// CHECK: ttcmmimfp16ps tmm6, tmm5, tmm4
+// CHECK: encoding: [0xc4,0xe2,0x5b,0x6b,0xf5]
+ ttcmmimfp16ps tmm6, tmm5, tmm4
+
+// CHECK: ttcmmimfp16ps tmm3, tmm2, tmm1
+// CHECK: encoding: [0xc4,0xe2,0x73,0x6b,0xda]
+ ttcmmimfp16ps tmm3, tmm2, tmm1
+
+// CHECK: ttcmmrlfp16ps tmm6, tmm5, tmm4
+// CHECK: encoding: [0xc4,0xe2,0x5a,0x6b,0xf5]
+ ttcmmrlfp16ps tmm6, tmm5, tmm4
+
+// CHECK: ttcmmrlfp16ps tmm3, tmm2, tmm1
+// CHECK: encoding: [0xc4,0xe2,0x72,0x6b,0xda]
+ ttcmmrlfp16ps tmm3, tmm2, tmm1
+
+// CHECK: tconjtcmmimfp16ps tmm6, tmm5, tmm4
+// CHECK: encoding: [0xc4,0xe2,0x58,0x6b,0xf5]
+ tconjtcmmimfp16ps tmm6, tmm5, tmm4
+
+// CHECK: tconjtcmmimfp16ps tmm3, tmm2, tmm1
+// CHECK: encoding: [0xc4,0xe2,0x70,0x6b,0xda]
+ tconjtcmmimfp16ps tmm3, tmm2, tmm1
+
+// CHECK: tconjtfp16 tmm6, tmm5
+// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xf5]
+ tconjtfp16 tmm6, tmm5
+
+// CHECK: tconjtfp16 tmm3, tmm2
+// CHECK: encoding: [0xc4,0xe2,0x79,0x6b,0xda]
+ tconjtfp16 tmm3, tmm2
More information about the cfe-commits
mailing list