[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE (PR #113532)
Feng Zou via cfe-commits
cfe-commits at lists.llvm.org
Thu Oct 31 08:35:32 PDT 2024
================
@@ -0,0 +1,248 @@
+/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * ===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_TRANSPOSEINTRIN_H
+#define __AMX_TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_TRANSPOSE \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))
+
+#define _tile_2rpntlvwz0(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz0(tdst, base, stride)
+#define _tile_2rpntlvwz0t1(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride)
+#define _tile_2rpntlvwz1(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz1(tdst, base, stride)
+#define _tile_2rpntlvwz1t1(tdst, base, stride) \
+ __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)
+
+/// Transpose 32-bit elements from \a src and write the result to \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void __tile_transposed(__tile dst, __tile src);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src
+/// The 1st source tile. Max size is 1024 Bytes.
+///
+/// \code{.operation}
+///
+/// FOR i := 0 TO (dst.rows-1)
+/// tmp[511:0] := 0
+/// FOR j := 0 TO (dst.colsb/4-1)
+/// tmp.dword[j] := src.row[j].dword[i]
+/// ENDFOR
+/// dst.row[i] := tmp
+/// ENDFOR
+///
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src)
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ // Use __tile1024i_1024a* to escape the alignment check in
+ // clang/test/Headers/x86-intrinsics-headers-clean.cpp
+ __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
+ (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ __builtin_ia32_t2rpntlvwz0t1_internal(
+ row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
+ (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal(
+ unsigned short row, unsigned short col0, unsigned short col1,
+ _tile1024i *dst0, _tile1024i *dst1, const void *base,
+ __SIZE_TYPE__ stride) {
+ __builtin_ia32_t2rpntlvwz1t1_internal(
+ row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+// This is internal intrinsic. C/C++ user should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE
+_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) {
+ return __builtin_ia32_ttransposed_internal(m, n, src);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+/// Provides a hint to the implementation that the data will likely not be
+/// reused in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE
+static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE
+static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written. The last row will be not be read from memory but instead
+/// filled with zeros.
+/// Provides a hint to the implementation that the data will likely not be
+/// reused in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE
+static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written. The last row will be not be read from memory but instead
+/// filled with zeros.
+/// Provides a hint to the implementation that the data will likely not be
+/// reused in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction.
+///
+/// \param dst0
+/// First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+/// Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE
+static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
+ const void *base, __SIZE_TYPE__ stride) {
+ _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+ &dst1->tile, base, stride);
+}
+
+/// Transpose 32-bit elements from src and write the result to dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src
+/// The 1st source tile. Max size is 1024 Bytes.
----------------
fzou1 wrote:
Remove "1st" as above.
https://github.com/llvm/llvm-project/pull/113532
More information about the cfe-commits
mailing list