[clang] [llvm] [X86][AMX] Support AMX-MOVRS (PR #115151)

Wed Nov 6 04:09:38 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-clang-codegen

Author: Malay Sanghi (MalaySanghi)

<details>
<summary>Changes</summary>

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

---

Patch is 81.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115151.diff


31 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsX86_64.def (+14) 
- (modified) clang/include/clang/Driver/Options.td (+2) 
- (modified) clang/lib/Basic/Targets/X86.cpp (+6) 
- (modified) clang/lib/Basic/Targets/X86.h (+1) 
- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+17-1) 
- (modified) clang/lib/Headers/CMakeLists.txt (+1) 
- (added) clang/lib/Headers/amxmovrsintrin.h (+48) 
- (modified) clang/lib/Headers/amxtransposeintrin.h (+177) 
- (modified) clang/lib/Headers/immintrin.h (+4) 
- (modified) clang/lib/Sema/SemaX86.cpp (+6) 
- (added) clang/test/CodeGen/X86/amx_movrs.c (+25) 
- (added) clang/test/CodeGen/X86/amx_movrs_api.c (+34) 
- (added) clang/test/CodeGen/X86/amx_movrs_errors.c (+14) 
- (added) clang/test/CodeGen/X86/amx_movrs_tranpose.c (+53) 
- (added) clang/test/CodeGen/X86/amx_movrs_tranpose_api.c (+81) 
- (added) clang/test/CodeGen/X86/amx_movrs_transpose_errors.c (+22) 
- (modified) llvm/include/llvm/IR/IntrinsicsX86.td (+48) 
- (modified) llvm/lib/Target/X86/X86.td (+3) 
- (modified) llvm/lib/Target/X86/X86ExpandPseudo.cpp (+35) 
- (modified) llvm/lib/Target/X86/X86ISelDAGToDAG.cpp (+107-2) 
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+81) 
- (modified) llvm/lib/Target/X86/X86InstrAMX.td (+91) 
- (modified) llvm/lib/Target/X86/X86InstrInfo.cpp (+1) 
- (modified) llvm/lib/Target/X86/X86InstrPredicates.td (+1) 
- (modified) llvm/lib/Target/X86/X86LowerAMXType.cpp (+7-1) 
- (modified) llvm/lib/Target/X86/X86RegisterInfo.cpp (+8-2) 
- (added) llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll (+108) 
- (added) llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll (+92) 
- (added) llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt (+98) 
- (added) llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s (+89) 
- (added) llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s (+97) 


``````````diff

diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index d95e8455a304b6..98235023bddc7b 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -117,7 +117,9 @@ TARGET_BUILTIN(__builtin_ia32_uwrmsr, "vULLiULLi", "n", "usermsr")
 // AMX internal builtin
 TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tileloaddrs64_internal, "V256iUsUsvC*z", "n", "amx-movrs")
 TARGET_BUILTIN(__builtin_ia32_tileloaddt164_internal, "V256iUsUsvC*z", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tileloaddrst164_internal, "V256iUsUsvC*z", "n", "amx-movrs")
 TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
 TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
 TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
@@ -129,15 +131,27 @@ TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i",
 TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
 TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose")
+
 // AMX
 TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tilerelease, "v", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tilezero, "vUc", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
+
+TARGET_BUILTIN(__builtin_ia32_tileloaddrs64, "vIUcvC*z", "n", "amx-movrs")
+TARGET_BUILTIN(__builtin_ia32_tileloaddrst164, "vIUcvC*z", "n", "amx-movrs")
 
 TARGET_BUILTIN(__builtin_ia32_tileloadd64, "vIUcvC*z", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tileloaddt164, "vIUcvC*z", "n", "amx-tile")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 805b79491e6ea4..c36adb673dd4ea 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6303,6 +6303,8 @@ def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
 def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
 def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>;
 def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>;
+def mamx_movrs: Flag<["-"], "mamx-movrs">, Group<m_x86_Features_Group>;
+def mno_amx_movrs: Flag<["-"], "mno-amx-movrs">, Group<m_x86_Features_Group>;
 def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>;
 def mno_cmpccxadd : Flag<["-"], "mno-cmpccxadd">, Group<m_x86_Features_Group>;
 def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index d7d3adef42c79a..0ddc1ac4c47f22 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -430,6 +430,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasAMXCOMPLEX = true;
     } else if (Feature == "+amx-fp8") {
       HasAMXFP8 = true;
+    } else if (Feature == "+amx-movrs") {
+      HasAMXMOVRS = true;
     } else if (Feature == "+amx-transpose") {
       HasAMXTRANSPOSE = true;
     } else if (Feature == "+cmpccxadd") {
@@ -953,6 +955,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__AMX_COMPLEX__");
   if (HasAMXFP8)
     Builder.defineMacro("__AMX_FP8__");
+  if (HasAMXMOVRS)
+    Builder.defineMacro("__AMX_MOVRS__");
   if (HasAMXTRANSPOSE)
     Builder.defineMacro("__AMX_TRANSPOSE__");
   if (HasCMPCCXADD)
@@ -1085,6 +1089,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
       .Case("amx-fp16", true)
       .Case("amx-fp8", true)
       .Case("amx-int8", true)
+      .Case("amx-movrs", true)
       .Case("amx-tile", true)
       .Case("amx-transpose", true)
       .Case("avx", true)
@@ -1205,6 +1210,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
       .Case("amx-fp16", HasAMXFP16)
       .Case("amx-fp8", HasAMXFP8)
       .Case("amx-int8", HasAMXINT8)
+      .Case("amx-movrs", HasAMXMOVRS)
       .Case("amx-tile", HasAMXTILE)
       .Case("amx-transpose", HasAMXTRANSPOSE)
       .Case("avx", SSELevel >= AVX)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index e2eba63b992355..54a078d2f137b6 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -158,6 +158,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
   bool HasAMXBF16 = false;
   bool HasAMXCOMPLEX = false;
   bool HasAMXFP8 = false;
+  bool HasAMXMOVRS = false;
   bool HasAMXTRANSPOSE = false;
   bool HasSERIALIZE = false;
   bool HasTSXLDTRK = false;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 34fedd67114751..02ee0132bbb5eb 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16996,9 +16996,13 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   }
   // Corresponding to intrisics which will return 2 tiles (tile0_tile1).
   case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
+  case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
   case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
+  case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
   case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
-  case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: {
+  case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
+  case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
+  case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: {
     Intrinsic::ID IID;
     switch (BuiltinID) {
     default:
@@ -17006,15 +17010,27 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
       IID = Intrinsic::x86_t2rpntlvwz0_internal;
       break;
+    case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
+      IID = Intrinsic::x86_t2rpntlvwz0rs_internal;
+      break;
     case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
       IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
       break;
+    case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
+      IID = Intrinsic::x86_t2rpntlvwz0rst1_internal;
+      break;
     case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
       IID = Intrinsic::x86_t2rpntlvwz1_internal;
       break;
+    case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
+      IID = Intrinsic::x86_t2rpntlvwz1rs_internal;
+      break;
     case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
       IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
       break;
+    case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal:
+      IID = Intrinsic::x86_t2rpntlvwz1rst1_internal;
+      break;
     }
 
     // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 67242cd4d981bc..a50cf01eac6fef 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -151,6 +151,7 @@ set(x86_files
   amxfp8intrin.h
   amxintrin.h
   amxtransposeintrin.h
+  amxmovrsintrin.h
   avx10_2_512bf16intrin.h
   avx10_2_512convertintrin.h
   avx10_2_512minmaxintrin.h
diff --git a/clang/lib/Headers/amxmovrsintrin.h b/clang/lib/Headers/amxmovrsintrin.h
new file mode 100644
index 00000000000000..5fe2fdecb8b5dd
--- /dev/null
+++ b/clang/lib/Headers/amxmovrsintrin.h
@@ -0,0 +1,48 @@
+/*===-------- amxmovrsintrin.h - AMX MOVRS intrinsics -*- C++ -*---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * ===-------------------------------------------------------------------=== */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxmovrsintrin.h> directly; include <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMXMOVRSINTRIN_H
+#define __AMXMOVRSINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_MOVRS                                               \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-movrs")))
+
+#define _tile_loaddrs(dst, base, stride)                                       \
+  __builtin_ia32_tileloaddrs64((dst), ((const void *)(base)),                  \
+                               (__SIZE_TYPE__)(stride))
+#define _tile_stream_loaddrs(dst, base, stride)                                \
+  __builtin_ia32_tileloaddrst164((dst), ((const void *)(base)),                \
+                                 (__SIZE_TYPE__)(stride))
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS
+_tile_loaddrs_internal(unsigned short m, unsigned short n, const void *base,
+                       __SIZE_TYPE__ stride) {
+  return __builtin_ia32_tileloaddrs64_internal(m, n, base,
+                                               (__SIZE_TYPE__)(stride));
+}
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS
+_tile_loaddrst1_internal(unsigned short m, unsigned short n, const void *base,
+                         __SIZE_TYPE__ stride) {
+  return __builtin_ia32_tileloaddrst164_internal(m, n, base,
+                                                 (__SIZE_TYPE__)(stride));
+}
+static __inline__ void __DEFAULT_FN_ATTRS_MOVRS
+__tile_loaddrs(__tile1024i *dst, const void *base, __SIZE_TYPE__ stride) {
+  dst->tile = _tile_loaddrs_internal(dst->row, dst->col, base, stride);
+}
+static __inline__ void __DEFAULT_FN_ATTRS_MOVRS __tile_stream_loaddrs(
+    __tile1024i *dst, const void *base, __SIZE_TYPE__ stride) {
+  dst->tile = _tile_loaddrst1_internal(dst->row, dst->col, base, stride);
+}
+#undef __DEFAULT_FN_ATTRS_MOVRS
+#endif /* __x86_64__ */
+#endif /* __AMXMOVRSINTRIN_H */
diff --git a/clang/lib/Headers/amxtransposeintrin.h b/clang/lib/Headers/amxtransposeintrin.h
index b3fa37d766c45b..086c9a75222ca1 100644
--- a/clang/lib/Headers/amxtransposeintrin.h
+++ b/clang/lib/Headers/amxtransposeintrin.h
@@ -17,6 +17,9 @@
 
 #define __DEFAULT_FN_ATTRS_TRANSPOSE                                           \
   __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))
+#define __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("amx-transpose,amx-movrs")))
 
 #define _tile_2rpntlvwz0(tdst, base, stride)                                   \
   __builtin_ia32_t2rpntlvwz0(tdst, base, stride)
@@ -26,6 +29,15 @@
   __builtin_ia32_t2rpntlvwz1(tdst, base, stride)
 #define _tile_2rpntlvwz1t1(tdst, base, stride)                                 \
   __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)
+// MOVRS versions
+#define _tile_2rpntlvwz0rs(tdst, base, stride)                                 \
+  __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride)
+#define _tile_2rpntlvwz0rst1(tdst, base, stride)                               \
+  __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride)
+#define _tile_2rpntlvwz1rs(tdst, base, stride)                                 \
+  __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride)
+#define _tile_2rpntlvwz1rst1(tdst, base, stride)                               \
+  __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride)
 
 /// Transpose 32-bit elements from \a src and write the result to \a dst.
 ///
@@ -101,6 +113,45 @@ _tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) {
   return __builtin_ia32_ttransposed_internal(m, n, src);
 }
 
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
+_tile_2rpntlvwz0rs_internal(unsigned short row, unsigned short col0,
+                            unsigned short col1, _tile1024i *dst0,
+                            _tile1024i *dst1, const void *base,
+                            __SIZE_TYPE__ stride) {
+  // Use __tile1024i_1024a* to escape the alignment check in
+  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
+  __builtin_ia32_t2rpntlvwz0rs_internal(
+      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+      (__SIZE_TYPE__)(stride));
+}
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
+_tile_2rpntlvwz0rst1_internal(unsigned short row, unsigned short col0,
+                              unsigned short col1, _tile1024i *dst0,
+                              _tile1024i *dst1, const void *base,
+                              __SIZE_TYPE__ stride) {
+  __builtin_ia32_t2rpntlvwz0rst1_internal(
+      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+      (__SIZE_TYPE__)(stride));
+}
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
+_tile_2rpntlvwz1rs_internal(unsigned short row, unsigned short col0,
+                            unsigned short col1, _tile1024i *dst0,
+                            _tile1024i *dst1, const void *base,
+                            __SIZE_TYPE__ stride) {
+  __builtin_ia32_t2rpntlvwz1rs_internal(
+      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+      (__SIZE_TYPE__)(stride));
+}
+static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
+_tile_2rpntlvwz1rst1_internal(unsigned short row, unsigned short col0,
+                              unsigned short col1, _tile1024i *dst0,
+                              _tile1024i *dst1, const void *base,
+                              __SIZE_TYPE__ stride) {
+  __builtin_ia32_t2rpntlvwz1rst1_internal(
+      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+      (__SIZE_TYPE__)(stride));
+}
+
 /// Converts a pair of tiles from memory into VNNI format, and places the
 /// results in a pair of destinations specified by dst. The pair of tiles
 /// in memory is specified via a tsib; the second tile is after the first
@@ -229,6 +280,131 @@ static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
                               &dst1->tile, base, stride);
 }
 
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+/// Provides a hint to the implementation that the data will likely become
+/// read shared in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction.
+///
+/// \param dst0
+///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
+static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1,
+                                const void *base, __SIZE_TYPE__ stride) {
+  _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+                              &dst1->tile, base, stride);
+}
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1RS </c> instruction.
+///
+/// \param dst0
+///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
+static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1,
+                                  const void *base, __SIZE_TYPE__ stride) {
+  _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+                                &dst1->tile, base, stride);
+}
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written. The last row will be not be read from memory but instead
+/// filled with zeros.
+/// Provides a hint to the implementation that the data will likely become
+/// read shared in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
+///
+/// \param dst0
+///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TRANSPOSE_MOVRS
+static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1,
+                                const void *base, __SIZE_TYPE__ stride) {
+  _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+                              &dst1->tile, base, stride);
+}
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/115151