[clang] [llvm] [X86][AMX] Support AMX-TRANSPOSE, part 2 (PR #115660)

Sun Nov 10 06:49:25 PST 2024

llvmbot wrote:



@llvm/pr-subscribers-clang

@llvm/pr-subscribers-backend-x86

Author: Phoebe Wang (phoebewang)

<details>
<summary>Changes</summary>

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

---

Patch is 68.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115660.diff


22 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsX86_64.def (+12) 
- (modified) clang/lib/Headers/CMakeLists.txt (+3) 
- (added) clang/lib/Headers/amxbf16transposeintrin.h (+94) 
- (added) clang/lib/Headers/amxcomplextransposeintrin.h (+301) 
- (modified) clang/lib/Headers/amxfp16intrin.h (+35) 
- (added) clang/lib/Headers/amxfp16transposeintrin.h (+94) 
- (modified) clang/lib/Headers/amxintrin.h (-32) 
- (modified) clang/lib/Headers/immintrin.h (+19-3) 
- (modified) clang/lib/Sema/SemaX86.cpp (+6) 
- (modified) clang/test/CodeGen/X86/amx_transpose.c (+39) 
- (modified) clang/test/CodeGen/X86/amx_transpose_api.c (+49-1) 
- (modified) clang/test/CodeGen/X86/amx_transpose_errors.c (+47-3) 
- (modified) llvm/include/llvm/IR/IntrinsicsX86.td (+57) 
- (modified) llvm/lib/Target/X86/X86ExpandPseudo.cpp (+25-3) 
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+34-24) 
- (modified) llvm/lib/Target/X86/X86InstrAMX.td (+89) 
- (modified) llvm/lib/Target/X86/X86LowerAMXType.cpp (+23-1) 
- (modified) llvm/lib/Target/X86/X86RegisterInfo.cpp (+7-1) 
- (modified) llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll (+76-1) 
- (modified) llvm/test/MC/Disassembler/X86/amx-transpose-att.txt (+48) 
- (modified) llvm/test/MC/X86/amx-transpose-att.s (+48) 
- (modified) llvm/test/MC/X86/amx-transpose-intel.s (+48) 


``````````diff

diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 9f7462b1e0d962..cc8637ed9c50da 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -133,6 +133,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z",
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-bf16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtfp16_internal, "V256iUsUsV256i", "n", "amx-complex,amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
@@ -164,6 +170,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1, "vIUcvC*z", "n","amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n","amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtfp16, "vIUcIUc", "n", "amx-complex,amx-transpose")
 
 TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps, "V16fIUcUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h, "V32yIUcUi", "n", "amx-avx512,avx10.2-512")
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 76366ca1f108e9..19013d37f46ef7 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -147,8 +147,11 @@ set(x86_files
   adxintrin.h
   ammintrin.h
   amxavx512intrin.h
+  amxbf16transposeintrin.h
   amxcomplexintrin.h
+  amxcomplextransposeintrin.h
   amxfp16intrin.h
+  amxfp16transposeintrin.h
   amxfp8intrin.h
   amxintrin.h
   amxtransposeintrin.h
diff --git a/clang/lib/Headers/amxbf16transposeintrin.h b/clang/lib/Headers/amxbf16transposeintrin.h
new file mode 100644
index 00000000000000..7d31384e317988
--- /dev/null
+++ b/clang/lib/Headers/amxbf16transposeintrin.h
@@ -0,0 +1,94 @@
+/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <amxbf16transposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_BF16TRANSPOSEINTRIN_H
+#define __AMX_BF16TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("amx-bf16,amx-transpose")))
+
+/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
+///    tiles \a a and \a b, accumulating the intermediate single-precision
+///    (32-bit) floating-point elements with elements in \a dst, and store the
+///    32-bit result back to tile \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///	tmp := dst.row[m]
+///	FOR k := 0 TO (a.colsb / 4) - 1
+///		FOR n := 0 TO (dst.colsb / 4) - 1
+///			tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) *
+///					FP32(b.row[k].bf16[2*n+0])
+///			tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) *
+///					FP32(b.row[k].bf16[2*n+1])
+///		ENDFOR
+///	ENDFOR
+///	write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTDPBF16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps(dst, a, b)
+
+/// This is internal intrinsic. C/C++ user should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS
+_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
+///    tiles src0 and src1, accumulating the intermediate single-precision
+///    (32-bit) floating-point elements with elements in "dst", and store the
+///    32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTDPBF16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0,
+                                        __tile1024i src1) {
+  dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
+                                       src0.tile, src1.tile);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __x86_64__ */
+#endif /* __AMX_BF16TRANSPOSEINTRIN_H */
diff --git a/clang/lib/Headers/amxcomplextransposeintrin.h b/clang/lib/Headers/amxcomplextransposeintrin.h
new file mode 100644
index 00000000000000..06fb53e4deadcd
--- /dev/null
+++ b/clang/lib/Headers/amxcomplextransposeintrin.h
@@ -0,0 +1,301 @@
+/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H
+#define __AMX_COMPLEXTRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("amx-complex,amx-transpose")))
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///    accumulate the results into a packed single precision tile. Each dword
+///    element in input tiles \a a and \a b is interpreted as a complex number
+///    with FP16 real part and FP16 imaginary part.
+/// Calculates the imaginary part of the result. For each possible combination
+///    of (transposed column of \a a, column of \a b), it performs a set of
+///    multiplication and accumulations on all corresponding complex numbers
+///    (one from \a a and one from \a b). The imaginary part of the \a a element
+///    is multiplied with the real part of the corresponding \a b element, and
+///    the real part of the \a a element is multiplied with the imaginary part
+///    of the corresponding \a b elements. The two accumulated results are
+///    added, and then accumulated into the corresponding row and column of
+///    \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///	tmp := dst.row[m]
+///	FOR k := 0 TO a.rows - 1
+///		FOR n := 0 TO (dst.colsb / 4) - 1
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
+///		ENDFOR
+///	ENDFOR
+///	write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmimfp16ps(dst, a, b) __builtin_ia32_ttcmmimfp16ps(dst, a, b)
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///    accumulate the results into a packed single precision tile. Each dword
+///    element in input tiles \a a and \a b is interpreted as a complex number
+///    with FP16 real part and FP16 imaginary part.
+/// Calculates the real part of the result. For each possible combination
+///    of (rtransposed colum of \a a, column of \a b), it performs a set of
+///    multiplication and accumulations on all corresponding complex numbers
+///    (one from \a a and one from \a b). The real part of the \a a element is
+///    multiplied with the real part of the corresponding \a b element, and the
+///    negated imaginary part of the \a a element is multiplied with the
+///    imaginary part of the corresponding \a b elements. The two accumulated
+///    results are added, and then accumulated into the corresponding row and
+///    column of \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///	tmp := dst.row[m]
+///	FOR k := 0 TO a.rows - 1
+///		FOR n := 0 TO (dst.colsb / 4) - 1
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
+///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
+///		ENDFOR
+///	ENDFOR
+///	write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmrlfp16ps(dst, a, b) __builtin_ia32_ttcmmrlfp16ps(dst, a, b)
+
+/// Perform matrix conjugate transpose and multiplication of two tiles
+///    containing complex elements and accumulate the results into a packed
+///    single precision tile. Each dword element in input tiles \a a and \a b
+///    is interpreted as a complex number with FP16 real part and FP16 imaginary
+///    part.
+/// Calculates the imaginary part of the result. For each possible combination
+///    of (transposed column of \a a, column of \a b), it performs a set of
+///    multiplication and accumulations on all corresponding complex numbers
+///    (one from \a a and one from \a b). The negated imaginary part of the \a a
+///    element is multiplied with the real part of the corresponding \a b
+///    element, and the real part of the \a a element is multiplied with the
+///    imaginary part of the corresponding \a b elements. The two accumulated
+///    results are added, and then accumulated into the corresponding row and
+///    column of \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_conjtcmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///	tmp := dst.row[m]
+///	FOR k := 0 TO a.rows - 1
+///		FOR n := 0 TO (dst.colsb / 4) - 1
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
+///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
+///		ENDFOR
+///	ENDFOR
+///	write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCONJTCMMIMFP16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_conjtcmmimfp16ps(dst, a, b)                                      \
+  __builtin_ia32_tconjtcmmimfp16ps(dst, a, b)
+
+/// Perform conjugate transpose of an FP16-pair of complex elements from \a a
+///    and writes the result to \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_conjtfp16(__tile dst, __tile a);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR i := 0 TO dst.rows - 1
+///	FOR j := 0 TO (dst.colsb / 4) - 1
+///		tmp.fp16[2*j+0] := a.row[j].fp16[2*i+0]
+///		tmp.fp16[2*j+1] := -a.row[j].fp16[2*i+1]
+///	ENDFOR
+///	write_row_and_zero(dst, i, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCONJTFP16 instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The source tile. Max size is 1024 Bytes.
+#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16(dst, a)
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmimfp16ps_internal(
+    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
+    _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_ttcmmimfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmrlfp16ps_internal(
+    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
+    _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_ttcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtcmmimfp16ps_internal(
+    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
+    _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tconjtcmmimfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtfp16_internal(
+    unsigned short m, unsigned short n, _tile1024i src) {
+  return __builtin_ia32_tconjtfp16_internal(m, n, src);
+}
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///    accumulate the results into a packed single precision tile. Each dword
+///    element in input tiles src0 and src1 is interpreted as a complex number
+///    with FP16 real part and FP16 imaginary part.
+///    This function calculates the imaginary part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTCMMIMFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_tcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
+                                __tile1024i src1) {
+  dst->tile = _tile_tcmmimfp16ps_internal(src0.row, src1.col, src0.col,
+                                          dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///    accumulate the results into a packed single precision tile. Each dword
+///    element in input tiles src0 and src1 is interpreted as a complex number
+///    with FP16 real part and FP16 imaginary part.
+///    This function calculates the real part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTCMMRLFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_tcmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
+                                __tile1024i src1) {
+  dst->tile = _tile_tcmmrlfp16ps_internal(src0.row, src1.col, src0.col,
+                                          dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform matrix conjugate transpose and multiplication of two tiles
+///    containing complex elements and accumulate the results into a packed
+///    single precision tile. Each dword element in input tiles src0 and src1
+///    is interpreted as a complex number with FP16 real part and FP16 imaginary
+///    part.
+///    This function calculates the imaginary part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCONJTCMMIMFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_conjtcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
+                                    __tile1024i src1) {
+  dst->tile = _tile_conjtcmmimfp16ps_internal(src0.row, src1.col, src0.col,
+                                              dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform conjugate transpose of an FP16-pair of complex elements from src and
+///    writes the result to dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCONJTFP16 </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src
+///    The source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_conjtfp16(__tile1024i *dst, __tile1024i src) {
+  dst->tile = _tile_conjtfp16_internal(src.row, src.col, src.tile);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif // __x86_64__
+#endif // __AMX_COMPLEXTRANSPOSEINTRIN_H
diff --git a/clang/lib/Headers/amxfp16intrin.h b/clang/lib/Headers/amxfp16intrin.h
index ed798245d41efb..bb4bc31fdafd50 100644
--- a/clang/lib/Headers/amxfp16intrin.h
+++ b/clang/lib/Headers/amxfp16intrin.h
@@ -15,6 +15,10 @@
 #define __AMX_FP16INTRIN_H
 #ifdef __x86_64__
 
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
+
 /// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
 ///    and \a b, accumulating the intermediate single-precision (32-bit)
 ///    floating-point elements with elements in \a dst, and store the 32-bit
@@ -54,5 +58,36 @@
 #define _tile_dpfp16ps(dst, a, b)                                \
   __builtin_ia32_tdpfp16ps(dst, a, b)
 
+/// This is internal intrinsic. C/C++ user should avoid calli...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/115660