[clang] [llvm] [X86][AMX] Support AMX-AVX512 (PR #114070)

Phoebe Wang via cfe-commits cfe-commits at lists.llvm.org
Wed Nov 6 01:56:40 PST 2024


================
@@ -0,0 +1,381 @@
+/*===--------------------- amxavx512intrin.h - AMXAVX512 --------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_AVX512INTRIN_H
+#define __AMX_AVX512INTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_AVX512                                              \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-avx512")))
+
+/// Moves a row from a tile register to a zmm destination register, converting
+///    the int32 source elements to fp32. The row of the tile is selected by an
+///    32b GPR.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m512i _tile_cvtrowd2ps(__tile tsrc, unsigned int row);
+/// \endcode
+///
+/// \code{.operation}
+/// VL := 512
+/// VL_bytes := VL >> 3
+/// row_index := row & 0xffff
+/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
+/// FOR i := 0 TO (VL_bytes / 4) - 1
+///     IF i + row_chunk / 4 >= tsrc.colsb / 4
+///         dst.dword[i] := 0
+///     ELSE
+///         dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE)
+///     FI
+/// ENDFOR
+/// dst[MAX_VL-1:VL] := 0
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCVTROWD2PS instruction.
+///
+/// \param tsrc
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param row
+///    The row of the source tile
+#define _tile_cvtrowd2ps(tsrc, row) __builtin_ia32_tcvtrowd2ps(tsrc, row)
+
+/// Moves a row from a tile register to a zmm destination register, converting
+///    the fp32 source elements to bf16. It places the resulting bf16 elements
+///    in the high 16 bits within each dword. The row of the tile is selected
+///    by an 32b GPR.
----------------
phoebewang wrote:

Done.

https://github.com/llvm/llvm-project/pull/114070


More information about the cfe-commits mailing list