[clang] f77101e - [X86][AMX] Support AMX-MOVRS (#115151)

via cfe-commits cfe-commits at lists.llvm.org
Mon Nov 11 23:05:49 PST 2024


Author: Malay Sanghi
Date: 2024-11-12T15:05:43+08:00
New Revision: f77101ea7913ab6a6b28ad03c152c615a89900f6

URL: https://github.com/llvm/llvm-project/commit/f77101ea7913ab6a6b28ad03c152c615a89900f6
DIFF: https://github.com/llvm/llvm-project/commit/f77101ea7913ab6a6b28ad03c152c615a89900f6.diff

LOG: [X86][AMX] Support AMX-MOVRS (#115151)

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

Added: 
    clang/lib/Headers/amxmovrsintrin.h
    clang/lib/Headers/amxmovrstransposeintrin.h
    clang/test/CodeGen/X86/amx_movrs.c
    clang/test/CodeGen/X86/amx_movrs_api.c
    clang/test/CodeGen/X86/amx_movrs_errors.c
    clang/test/CodeGen/X86/amx_movrs_tranpose.c
    clang/test/CodeGen/X86/amx_movrs_tranpose_api.c
    clang/test/CodeGen/X86/amx_movrs_transpose_errors.c
    llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll
    llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll
    llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt
    llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s
    llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s

Modified: 
    clang/docs/ReleaseNotes.rst
    clang/include/clang/Basic/BuiltinsX86_64.def
    clang/include/clang/Driver/Options.td
    clang/lib/Basic/Targets/X86.cpp
    clang/lib/Basic/Targets/X86.h
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/lib/Headers/CMakeLists.txt
    clang/lib/Headers/immintrin.h
    clang/lib/Sema/SemaX86.cpp
    llvm/include/llvm/IR/IntrinsicsX86.td
    llvm/include/llvm/TargetParser/X86TargetParser.def
    llvm/lib/Target/X86/X86.td
    llvm/lib/Target/X86/X86ExpandPseudo.cpp
    llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86InstrAMX.td
    llvm/lib/Target/X86/X86InstrPredicates.td
    llvm/lib/Target/X86/X86LowerAMXType.cpp
    llvm/lib/Target/X86/X86RegisterInfo.cpp
    llvm/lib/TargetParser/Host.cpp
    llvm/lib/TargetParser/X86TargetParser.cpp

Removed: 
    


################################################################################
diff  --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 4ef48bed58d95c..c8aa053986cc00 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -741,6 +741,7 @@ X86 Support
   * Supported intrinsics of ``_mm(256|512)_(mask(z))_loadrs_epi(8|16|32|64)``.
 - Support ISA of ``AMX-FP8``.
 - Support ISA of ``AMX-TRANSPOSE``.
+- Support ISA of ``AMX-MOVRS``.
 - Support ISA of ``AMX-AVX512``.
 - Support ISA of ``AMX-TF32``.
 

diff  --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 25c10d39df32e2..f853b4313dae07 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -117,7 +117,9 @@ TARGET_BUILTIN(__builtin_ia32_uwrmsr, "vULLiULLi", "n", "usermsr")
 // AMX internal builtin
 TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tileloaddrs64_internal, "V256iUsUsvC*z", "n", "amx-movrs")
 TARGET_BUILTIN(__builtin_ia32_tileloaddt164_internal, "V256iUsUsvC*z", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tileloaddrst164_internal, "V256iUsUsvC*z", "n", "amx-movrs")
 TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
 TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
 TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
@@ -129,10 +131,15 @@ TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i",
 TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
 TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-movrs,amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose")
+
 TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
@@ -147,6 +154,13 @@ TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tilerelease, "v", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tilezero, "vUc", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rs, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1rst1, "vIUcvC*z", "n", "amx-movrs,amx-transpose")
+
+TARGET_BUILTIN(__builtin_ia32_tileloaddrs64, "vIUcvC*z", "n", "amx-movrs")
+TARGET_BUILTIN(__builtin_ia32_tileloaddrst164, "vIUcvC*z", "n", "amx-movrs")
 
 TARGET_BUILTIN(__builtin_ia32_tileloadd64, "vIUcvC*z", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tileloaddt164, "vIUcvC*z", "n", "amx-tile")

diff  --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 1304ef3c5a228b..9fb7f8bb6489b0 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6303,6 +6303,8 @@ def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
 def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
 def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>;
 def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>;
+def mamx_movrs: Flag<["-"], "mamx-movrs">, Group<m_x86_Features_Group>;
+def mno_amx_movrs: Flag<["-"], "mno-amx-movrs">, Group<m_x86_Features_Group>;
 def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>;
 def mno_cmpccxadd : Flag<["-"], "mno-cmpccxadd">, Group<m_x86_Features_Group>;
 def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>;

diff  --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index dc85e9aa77cd3d..086b4415412e67 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -430,6 +430,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasAMXCOMPLEX = true;
     } else if (Feature == "+amx-fp8") {
       HasAMXFP8 = true;
+    } else if (Feature == "+amx-movrs") {
+      HasAMXMOVRS = true;
     } else if (Feature == "+amx-transpose") {
       HasAMXTRANSPOSE = true;
     } else if (Feature == "+amx-avx512") {
@@ -957,6 +959,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__AMX_COMPLEX__");
   if (HasAMXFP8)
     Builder.defineMacro("__AMX_FP8__");
+  if (HasAMXMOVRS)
+    Builder.defineMacro("__AMX_MOVRS__");
   if (HasAMXTRANSPOSE)
     Builder.defineMacro("__AMX_TRANSPOSE__");
   if (HasAMXAVX512)
@@ -1094,6 +1098,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
       .Case("amx-fp16", true)
       .Case("amx-fp8", true)
       .Case("amx-int8", true)
+      .Case("amx-movrs", true)
       .Case("amx-tf32", true)
       .Case("amx-tile", true)
       .Case("amx-transpose", true)
@@ -1216,6 +1221,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
       .Case("amx-fp16", HasAMXFP16)
       .Case("amx-fp8", HasAMXFP8)
       .Case("amx-int8", HasAMXINT8)
+      .Case("amx-movrs", HasAMXMOVRS)
       .Case("amx-tf32", HasAMXTF32)
       .Case("amx-tile", HasAMXTILE)
       .Case("amx-transpose", HasAMXTRANSPOSE)

diff  --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index 04b1d5d33ea231..06a7eed8177cb2 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -158,6 +158,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
   bool HasAMXBF16 = false;
   bool HasAMXCOMPLEX = false;
   bool HasAMXFP8 = false;
+  bool HasAMXMOVRS = false;
   bool HasAMXTRANSPOSE = false;
   bool HasAMXAVX512 = false;
   bool HasAMXTF32 = false;

diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 430ac5626f89d7..0807542825f634 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17025,9 +17025,13 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   }
   // Corresponding to intrisics which will return 2 tiles (tile0_tile1).
   case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
+  case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
   case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
+  case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
   case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
-  case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: {
+  case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
+  case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
+  case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: {
     Intrinsic::ID IID;
     switch (BuiltinID) {
     default:
@@ -17035,15 +17039,27 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
       IID = Intrinsic::x86_t2rpntlvwz0_internal;
       break;
+    case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
+      IID = Intrinsic::x86_t2rpntlvwz0rs_internal;
+      break;
     case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
       IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
       break;
+    case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
+      IID = Intrinsic::x86_t2rpntlvwz0rst1_internal;
+      break;
     case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
       IID = Intrinsic::x86_t2rpntlvwz1_internal;
       break;
+    case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
+      IID = Intrinsic::x86_t2rpntlvwz1rs_internal;
+      break;
     case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
       IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
       break;
+    case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal:
+      IID = Intrinsic::x86_t2rpntlvwz1rst1_internal;
+      break;
     }
 
     // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)

diff  --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index a094305bcec5e4..7227df93edece4 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -151,6 +151,8 @@ set(x86_files
   amxfp16intrin.h
   amxfp8intrin.h
   amxintrin.h
+  amxmovrsintrin.h
+  amxmovrstransposeintrin.h
   amxtf32intrin.h
   amxtf32transposeintrin.h
   amxtransposeintrin.h

diff  --git a/clang/lib/Headers/amxmovrsintrin.h b/clang/lib/Headers/amxmovrsintrin.h
new file mode 100644
index 00000000000000..5fe2fdecb8b5dd
--- /dev/null
+++ b/clang/lib/Headers/amxmovrsintrin.h
@@ -0,0 +1,48 @@
+/*===-------- amxmovrsintrin.h - AMX MOVRS intrinsics -*- C++ -*---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * ===-------------------------------------------------------------------=== */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxmovrsintrin.h> directly; include <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMXMOVRSINTRIN_H
+#define __AMXMOVRSINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_MOVRS                                               \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-movrs")))
+
+#define _tile_loaddrs(dst, base, stride)                                       \
+  __builtin_ia32_tileloaddrs64((dst), ((const void *)(base)),                  \
+                               (__SIZE_TYPE__)(stride))
+#define _tile_stream_loaddrs(dst, base, stride)                                \
+  __builtin_ia32_tileloaddrst164((dst), ((const void *)(base)),                \
+                                 (__SIZE_TYPE__)(stride))
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS
+_tile_loaddrs_internal(unsigned short m, unsigned short n, const void *base,
+                       __SIZE_TYPE__ stride) {
+  return __builtin_ia32_tileloaddrs64_internal(m, n, base,
+                                               (__SIZE_TYPE__)(stride));
+}
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS
+_tile_loaddrst1_internal(unsigned short m, unsigned short n, const void *base,
+                         __SIZE_TYPE__ stride) {
+  return __builtin_ia32_tileloaddrst164_internal(m, n, base,
+                                                 (__SIZE_TYPE__)(stride));
+}
+static __inline__ void __DEFAULT_FN_ATTRS_MOVRS
+__tile_loaddrs(__tile1024i *dst, const void *base, __SIZE_TYPE__ stride) {
+  dst->tile = _tile_loaddrs_internal(dst->row, dst->col, base, stride);
+}
+static __inline__ void __DEFAULT_FN_ATTRS_MOVRS __tile_stream_loaddrs(
+    __tile1024i *dst, const void *base, __SIZE_TYPE__ stride) {
+  dst->tile = _tile_loaddrst1_internal(dst->row, dst->col, base, stride);
+}
+#undef __DEFAULT_FN_ATTRS_MOVRS
+#endif /* __x86_64__ */
+#endif /* __AMXMOVRSINTRIN_H */

diff  --git a/clang/lib/Headers/amxmovrstransposeintrin.h b/clang/lib/Headers/amxmovrstransposeintrin.h
new file mode 100644
index 00000000000000..17a9f7506a0421
--- /dev/null
+++ b/clang/lib/Headers/amxmovrstransposeintrin.h
@@ -0,0 +1,200 @@
+/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics --------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * ===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <amxmovrstransposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_MOVRS_TRANSPOSEINTRIN_H
+#define __AMX_MOVRS_TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("amx-transpose,amx-movrs")))
+
+#define _tile_2rpntlvwz0rs(tdst, base, stride)                                 \
+  __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride)
+#define _tile_2rpntlvwz0rst1(tdst, base, stride)                               \
+  __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride)
+#define _tile_2rpntlvwz1rs(tdst, base, stride)                                 \
+  __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride)
+#define _tile_2rpntlvwz1rst1(tdst, base, stride)                               \
+  __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride)
+
+static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rs_internal(
+    unsigned short row, unsigned short col0, unsigned short col1,
+    _tile1024i *dst0, _tile1024i *dst1, const void *base,
+    __SIZE_TYPE__ stride) {
+  // Use __tile1024i_1024a* to escape the alignment check in
+  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
+  __builtin_ia32_t2rpntlvwz0rs_internal(
+      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+      (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rst1_internal(
+    unsigned short row, unsigned short col0, unsigned short col1,
+    _tile1024i *dst0, _tile1024i *dst1, const void *base,
+    __SIZE_TYPE__ stride) {
+  __builtin_ia32_t2rpntlvwz0rst1_internal(
+      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+      (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rs_internal(
+    unsigned short row, unsigned short col0, unsigned short col1,
+    _tile1024i *dst0, _tile1024i *dst1, const void *base,
+    __SIZE_TYPE__ stride) {
+  __builtin_ia32_t2rpntlvwz1rs_internal(
+      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+      (__SIZE_TYPE__)(stride));
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rst1_internal(
+    unsigned short row, unsigned short col0, unsigned short col1,
+    _tile1024i *dst0, _tile1024i *dst1, const void *base,
+    __SIZE_TYPE__ stride) {
+  __builtin_ia32_t2rpntlvwz1rst1_internal(
+      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
+      (__SIZE_TYPE__)(stride));
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+/// Provides a hint to the implementation that the data will likely become
+/// read shared in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction.
+///
+/// \param dst0
+///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS
+static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1,
+                                const void *base, __SIZE_TYPE__ stride) {
+  _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+                              &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1RS </c> instruction.
+///
+/// \param dst0
+///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS
+static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1,
+                                  const void *base, __SIZE_TYPE__ stride) {
+  _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+                                &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written. The last row will be not be read from memory but instead
+/// filled with zeros.
+/// Provides a hint to the implementation that the data will likely become
+/// read shared in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
+///
+/// \param dst0
+///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS
+static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1,
+                                const void *base, __SIZE_TYPE__ stride) {
+  _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+                              &dst1->tile, base, stride);
+}
+
+/// Converts a pair of tiles from memory into VNNI format, and places the
+/// results in a pair of destinations specified by dst. The pair of tiles
+/// in memory is specified via a tsib; the second tile is after the first
+/// one, separated by the same stride that separates each row.
+/// The tile configuration for the destination tiles indicates the amount
+/// of data to read from memory. The instruction will load a number of rows
+/// that is equal to twice the number of rows in tmm1. The size of each row
+/// is equal to the average width of the destination tiles. If the second
+/// tile is configured with zero rows and columns, only the first tile will
+/// be written. The last row will be not be read from memory but instead
+/// filled with zeros.
+/// Provides a hint to the implementation that the data will likely become
+/// read shared in the near future and the data caching can be optimized.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1RS </c> instruction.
+///
+/// \param dst0
+///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param dst1
+///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS
+static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1,
+                                  const void *base, __SIZE_TYPE__ stride) {
+  _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
+                                &dst1->tile, base, stride);
+}
+
+#undef __DEFAULT_FN_ATTRS
+#endif /* __x86_64__ */
+#endif /* __AMX_MOVRS_TRANSPOSEINTRIN_H */
\ No newline at end of file

diff  --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 87a502238ae162..f0dd7160ec7ff4 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -656,6 +656,15 @@ _storebe_i64(void * __P, long long __D) {
 #include <amxtransposeintrin.h>
 #endif
 
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_MOVRS__)
+#include <amxmovrsintrin.h>
+#endif
+
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
+    (defined(__AMX_MOVRS__) && defined(__AMX_TRANSPOSE__))
+#include <amxmovrstransposeintrin.h>
+#endif
+
 #if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_AVX512__)
 #include <amxavx512intrin.h>
 #endif

diff  --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index d7c8ed351f410a..85b195491025d3 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -629,12 +629,18 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
     return false;
   case X86::BI__builtin_ia32_tileloadd64:
   case X86::BI__builtin_ia32_tileloaddt164:
+  case X86::BI__builtin_ia32_tileloaddrs64:
+  case X86::BI__builtin_ia32_tileloaddrst164:
   case X86::BI__builtin_ia32_tilestored64:
   case X86::BI__builtin_ia32_tilezero:
   case X86::BI__builtin_ia32_t2rpntlvwz0:
   case X86::BI__builtin_ia32_t2rpntlvwz0t1:
   case X86::BI__builtin_ia32_t2rpntlvwz1:
   case X86::BI__builtin_ia32_t2rpntlvwz1t1:
+  case X86::BI__builtin_ia32_t2rpntlvwz0rst1:
+  case X86::BI__builtin_ia32_t2rpntlvwz1rs:
+  case X86::BI__builtin_ia32_t2rpntlvwz1rst1:
+  case X86::BI__builtin_ia32_t2rpntlvwz0rs:
   case X86::BI__builtin_ia32_tcvtrowps2pbf16h:
   case X86::BI__builtin_ia32_tcvtrowps2pbf16l:
   case X86::BI__builtin_ia32_tcvtrowps2phh:

diff  --git a/clang/test/CodeGen/X86/amx_movrs.c b/clang/test/CodeGen/X86/amx_movrs.c
new file mode 100755
index 00000000000000..4a8f001baafcea
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_movrs.c
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-movrs -target-feature +avx512f \
+// RUN: -emit-llvm -o - -Wall -Werror -pedantic \
+// RUN: -Wno-gnu-statement-expression| FileCheck %s
+
+#include <immintrin.h>
+#include <stddef.h>
+
+#define STRIDE 32
+
+char buf[1024];
+
+void test_tile_loadd(short row, short col) {
+  // CHECK-LABEL: define dso_local void @test_tile_loadd(
+  // CHECK:    call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+  // CHECK-NEXT:    call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+  _tile_loaddrs_internal(row, col, buf, STRIDE);
+}
+
+void test_tile_loaddt1(short row, short col) {
+  // CHECK-LABEL: define dso_local void @test_tile_loaddt1(
+  // CHECK:    call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+  // CHECK-NEXT:    call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+  _tile_loaddrst1_internal(row, col, buf, STRIDE);
+}

diff  --git a/clang/test/CodeGen/X86/amx_movrs_api.c b/clang/test/CodeGen/X86/amx_movrs_api.c
new file mode 100755
index 00000000000000..cf430adf140852
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_movrs_api.c
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-movrs  -emit-llvm -o - -Wall -Werror -pedantic \
+// RUN: -Wno-gnu-statement-expression| FileCheck %s
+
+#include <immintrin.h>
+#include <stddef.h>
+
+#define STRIDE 32
+
+char buf[1024];
+
+void test_tile_loadd(short row) {
+  // CHECK-LABEL: define dso_local void @test_tile_loadd(
+  // CHECK:    call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+  // CHECK-NEXT:    call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+  __tile1024i a = {row, 8};
+  __tile_loaddrs(&a, buf, STRIDE);
+}
+
+void test_tile_loaddt1(short row) {
+  // CHECK-LABEL: define dso_local void @test_tile_loaddt1(
+  // CHECK:    call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+  // CHECK-NEXT:    call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+  __tile1024i a = {row, 8};
+  __tile_stream_loaddrs(&a, buf, STRIDE);
+}
+
+void test_tile_loadd_macro(void *data) {
+  // CHECK-LABEL: define dso_local void @test_tile_loadd_macro(
+  // CHECK:    call void  @llvm.x86.tileloaddrs64(i8 {{.*}}, ptr %{{.*}}, i64 {{.*}})
+  // CHECK:    call void  @llvm.x86.tileloaddrst164(i8 {{.*}}, ptr %{{.*}}, i64 {{.*}})
+  _tile_loaddrs(4, data, STRIDE);
+  _tile_stream_loaddrs(2, data, STRIDE);
+}

diff  --git a/clang/test/CodeGen/X86/amx_movrs_errors.c b/clang/test/CodeGen/X86/amx_movrs_errors.c
new file mode 100755
index 00000000000000..495ea299236e2f
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_movrs_errors.c
@@ -0,0 +1,13 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-movrs -verify
+
+#include <immintrin.h>
+#include <stddef.h>
+
+char buf[1024];
+
+void test_tile_load() {
+  _tile_loaddrs(20, buf, 32); // expected-error {{argument value 20 is outside the valid range [0, 7]}}
+  _tile_stream_loaddrs(-1, buf, 20); // expected-error {{argument value 255 is outside the valid range [0, 7]}}
+}

diff  --git a/clang/test/CodeGen/X86/amx_movrs_tranpose.c b/clang/test/CodeGen/X86/amx_movrs_tranpose.c
new file mode 100755
index 00000000000000..192c153835e1e6
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_movrs_tranpose.c
@@ -0,0 +1,53 @@
+// RUN:  %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN:  -target-feature +amx-movrs  -emit-llvm -o - -Wall -Werror -pedantic \
+// RUN:  -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s
+
+#include <immintrin.h>
+#include <stddef.h>
+
+char buf[2048];
+#define STRIDE 32
+
+// CHECK-LABEL:  define dso_local void @test_tile_2rpntlvwz0rs_internal(
+// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+void test_tile_2rpntlvwz0rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
+  _tile_2rpntlvwz0rs_internal(row, col0, col1, D0, D1, B, 1);
+}
+
+// CHECK-LABEL:  define dso_local void @test_tile_2rpntlvwz0rst1_internal(
+// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+void test_tile_2rpntlvwz0rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
+  _tile_2rpntlvwz0rst1_internal(row, col0, col1, D0, D1, B, 1);
+}
+
+// CHECK-LABEL:  define dso_local void @test_tile_2rpntlvwz1rs_internal(
+// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+void test_tile_2rpntlvwz1rs_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
+  _tile_2rpntlvwz1rs_internal(row, col0, col1, D0, D1, B, 1);
+}
+
+// CHECK-LABEL:  define dso_local void @test_tile_2rpntlvwz1rst1_internal(
+// CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}}, ptr %{{.*}}, i64 %{{.*}})
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 0
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+// CHECK: store <256 x i32> %{{.*}}, ptr %{{.*}}, align 1024
+// CHECK: extractvalue { x86_amx, x86_amx } %{{.*}}, 1
+// CHECK: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %{{.*}})
+void test_tile_2rpntlvwz1rst1_internal(int row, int col0, int col1, void *D0, void *D1, void *B) {
+  _tile_2rpntlvwz1rst1_internal(row, col0, col1, D0, D1, B, 1);
+}

diff  --git a/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c b/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c
new file mode 100755
index 00000000000000..b174cc5067bf30
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_movrs_tranpose_api.c
@@ -0,0 +1,81 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-movrs  -emit-llvm -o - -Wall -Werror -pedantic \
+// RUN: -target-feature +amx-transpose -Wno-gnu-statement-expression| FileCheck %s
+
+#include <immintrin.h>
+#include <stddef.h>
+
+char buf[2048];
+#define STRIDE 32
+
+void test_tile_2rpntlvwz0rs(const void *A, size_t B) {
+  // CHECK-LABEL: @test_tile_2rpntlvwz0rs
+  // CHECK: call void @llvm.x86.t2rpntlvwz0rs(i8 1, ptr %{{.*}}, i64 %{{.*}})
+  _tile_2rpntlvwz0rs(1, A, B);
+}
+
+void test_tile_2rpntlvwz0rst1(const void *A, size_t B) {
+  // CHECK-LABEL: @test_tile_2rpntlvwz0rst1
+  // CHECK: call void @llvm.x86.t2rpntlvwz0rst1(i8 1, ptr %{{.*}}, i64 %{{.*}})
+  _tile_2rpntlvwz0rst1(1, A, B);
+}
+
+void test_tile_2rpntlvwz1rs(const void *A, size_t B) {
+  // CHECK-LABEL: @test_tile_2rpntlvwz1rs
+  // CHECK: call void @llvm.x86.t2rpntlvwz1rs(i8 1, ptr %{{.*}}, i64 %{{.*}})
+  _tile_2rpntlvwz1rs(1, A, B);
+}
+
+void test_tile_2rpntlvwz1rst1(const void *A, size_t B) {
+  // CHECK-LABEL: @test_tile_2rpntlvwz1rst1
+  // CHECK: call void @llvm.x86.t2rpntlvwz1rst1(i8 1, ptr %{{.*}}, i64 %{{.*}})
+  _tile_2rpntlvwz1rst1(1, A, B);
+}
+
+void test__tile_2rpntlvwz0rs(__tile1024i dst0, __tile1024i dst1) {
+  //CHECK-LABEL: @test__tile_2rpntlvwz0rs
+  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal
+  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
+  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
+  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+  __tile_2rpntlvwz0rs(&dst0, &dst1, buf, STRIDE);
+}
+
+void test__tile_2rpntlvwz0rst1(__tile1024i dst0, __tile1024i dst1) {
+  //CHECK-LABEL: @test__tile_2rpntlvwz0rst1
+  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal
+  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
+  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
+  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+  __tile_2rpntlvwz0rst1(&dst0, &dst1, buf, STRIDE);
+}
+
+void test__tile_2rpntlvwz1rs(__tile1024i dst0, __tile1024i dst1) {
+  //CHECK-LABEL: @test__tile_2rpntlvwz1rs
+  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal
+  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
+  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
+  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+  __tile_2rpntlvwz1rs(&dst0, &dst1, buf, STRIDE);
+}
+
+void test__tile_2rpntlvwz1rst1(__tile1024i dst0, __tile1024i dst1) {
+  //CHECK-LABEL: @test__tile_2rpntlvwz1rst1
+  //CHECK: call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal
+  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 0
+  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+  //CHECK-NEXT: {{%.*}} = extractvalue { x86_amx, x86_amx } {{%.*}}, 1
+  //CHECK-NEXT: {{%.*}} = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+  //CHECK-NEXT: store <256 x i32> {{%.*}}, ptr {{%.*}}
+  __tile_2rpntlvwz1rst1(&dst0, &dst1, buf, STRIDE);
+}

diff  --git a/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c
new file mode 100755
index 00000000000000..840b52bbb29bbf
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_movrs_transpose_errors.c
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-int8 -target-feature +amx-transpose -target-feature +amx-movrs \
+// RUN: -verify
+
+#include <immintrin.h>
+#include <stddef.h>
+
+void test_tile_2rpntlvwz0rs(const void *A, size_t B) {
+  _tile_2rpntlvwz0rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
+
+void test_tile_2rpntlvwz0rst1(const void *A, size_t B) {
+  _tile_2rpntlvwz0rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
+
+void test_tile_2rpntlvwz1rs(const void *A, size_t B) {
+  _tile_2rpntlvwz1rs(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
+
+void test_tile_2rpntlvwz1rst1(const void *A, size_t B) {
+  _tile_2rpntlvwz1rst1(8, A, B); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}

diff  --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index b2d6f44b7927a9..fcb506e9ebbfcc 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5882,6 +5882,12 @@ let TargetPrefix = "x86" in {
   def int_x86_tilestored64 : ClangBuiltin<"__builtin_ia32_tilestored64">,
               Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
                         [ImmArg<ArgIndex<0>>]>;
+  def int_x86_tileloaddrs64 : ClangBuiltin<"__builtin_ia32_tileloaddrs64">,
+              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+                        [ImmArg<ArgIndex<0>>]>;
+  def int_x86_tileloaddrst164 : ClangBuiltin<"__builtin_ia32_tileloaddrst164">,
+              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+                        [ImmArg<ArgIndex<0>>]>;
   def int_x86_tdpbssd : ClangBuiltin<"__builtin_ia32_tdpbssd">,
               Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
                         [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
@@ -5952,6 +5958,20 @@ let TargetPrefix = "x86" in {
               Intrinsic<[], [llvm_i8_ty, llvm_i8_ty],
                         [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
 
+  // AMX-MORVS, AMX-TRANSPOSE
+  def int_x86_t2rpntlvwz0rs : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0rs">,
+              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+                        [ImmArg<ArgIndex<0>>]>;
+  def int_x86_t2rpntlvwz0rst1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz0rst1">,
+              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+                        [ImmArg<ArgIndex<0>>]>;
+  def int_x86_t2rpntlvwz1rs : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1rs">,
+              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+                        [ImmArg<ArgIndex<0>>]>;
+  def int_x86_t2rpntlvwz1rst1 : ClangBuiltin<"__builtin_ia32_t2rpntlvwz1rst1">,
+              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
+                        [ImmArg<ArgIndex<0>>]>;
+
   // AMX-AVX512
   def int_x86_tcvtrowd2ps : ClangBuiltin<"__builtin_ia32_tcvtrowd2ps">,
               Intrinsic<[llvm_v16f32_ty], [llvm_i8_ty, llvm_i32_ty],
@@ -5986,6 +6006,16 @@ let TargetPrefix = "x86" in {
               Intrinsic<[llvm_x86amx_ty],
                         [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
                         []>;
+  def int_x86_tileloaddrs64_internal :
+              ClangBuiltin<"__builtin_ia32_tileloaddrs64_internal">,
+              Intrinsic<[llvm_x86amx_ty],
+                        [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+                        []>;
+  def int_x86_tileloaddrst164_internal :
+              ClangBuiltin<"__builtin_ia32_tileloaddrst164_internal">,
+              Intrinsic<[llvm_x86amx_ty],
+                        [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+                        []>;
   def int_x86_tdpbssd_internal :
               ClangBuiltin<"__builtin_ia32_tdpbssd_internal">,
               Intrinsic<[llvm_x86amx_ty],
@@ -6050,6 +6080,24 @@ let TargetPrefix = "x86" in {
                          llvm_x86amx_ty, llvm_x86amx_ty,
                          llvm_x86amx_ty], []>;
 
+  // AMX-MORVS, AMX-TRANSPOSE - internal intrinsics
+  def int_x86_t2rpntlvwz0rs_internal :
+              Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
+                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+                        [IntrArgMemOnly, IntrReadMem]>;
+  def int_x86_t2rpntlvwz0rst1_internal :
+              Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
+                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+                        [IntrArgMemOnly]>;
+  def int_x86_t2rpntlvwz1rs_internal :
+              Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
+                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+                        [IntrArgMemOnly]>;
+  def int_x86_t2rpntlvwz1rst1_internal :
+              Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
+                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+                        [IntrArgMemOnly]>;
+
   def int_x86_t2rpntlvwz0_internal :
               Intrinsic<[llvm_x86amx_ty, llvm_x86amx_ty],
                         [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],

diff  --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index 3b643563775688..5cad8e117f4ee2 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -266,6 +266,7 @@ X86_FEATURE       (MOVRS,           "movrs")
 X86_FEATURE       (ZU,              "zu")
 X86_FEATURE       (AMX_FP8,         "amx-fp8")
 X86_FEATURE       (AMX_TRANSPOSE,   "amx-transpose")
+X86_FEATURE       (AMX_MOVRS,       "amx-movrs")
 X86_FEATURE       (AMX_AVX512,      "amx-avx512")
 X86_FEATURE       (AMX_TF32,        "amx-tf32")
 // These features aren't really CPU features, but the frontend can set them.

diff  --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 35bbffdb20942d..509632183dc016 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -273,6 +273,9 @@ def FeatureAMXCOMPLEX : SubtargetFeature<"amx-complex", "HasAMXCOMPLEX", "true",
 def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true",
                                      "Support AMX-FP8 instructions",
                                      [FeatureAMXTILE]>;
+def FeatureAMXMOVRS : SubtargetFeature<"amx-movrs", "HasAMXMOVRS", "true",
+                                       "Support AMX-MOVRS instructions",
+                                       [FeatureAMXTILE]>;
 def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "true",
                                            "Support AMX amx-transpose instructions",
                                            [FeatureAMXTILE]>;

diff  --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 4f045d78f75fb2..73ca4b09c0aa53 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -559,6 +559,8 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
   }
   case X86::PTILELOADDV:
   case X86::PTILELOADDT1V:
+  case X86::PTILELOADDRSV:
+  case X86::PTILELOADDRST1V:
   case X86::PTCVTROWD2PSrreV:
   case X86::PTCVTROWD2PSrriV:
   case X86::PTCVTROWPS2PBF16HrreV:
@@ -575,6 +577,12 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
       MI.removeOperand(i);
     unsigned Opc;
     switch (Opcode) {
+    case X86::PTILELOADDRSV:
+      Opc = X86::TILELOADDRS;
+      break;
+    case X86::PTILELOADDRST1V:
+      Opc = X86::TILELOADDRST1;
+      break;
     case X86::PTILELOADDV:
       Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
       break;
@@ -719,7 +727,11 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
   case X86::PT2RPNTLVWZ0V:
   case X86::PT2RPNTLVWZ0T1V:
   case X86::PT2RPNTLVWZ1V:
-  case X86::PT2RPNTLVWZ1T1V: {
+  case X86::PT2RPNTLVWZ1T1V:
+  case X86::PT2RPNTLVWZ0RSV:
+  case X86::PT2RPNTLVWZ0RST1V:
+  case X86::PT2RPNTLVWZ1RSV:
+  case X86::PT2RPNTLVWZ1RST1V: {
     for (unsigned i = 3; i > 0; --i)
       MI.removeOperand(i);
     unsigned Opc;
@@ -736,6 +748,18 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
     case X86::PT2RPNTLVWZ1T1V:
       Opc = X86::T2RPNTLVWZ1T1;
       break;
+    case X86::PT2RPNTLVWZ0RSV:
+      Opc = X86::T2RPNTLVWZ0RS;
+      break;
+    case X86::PT2RPNTLVWZ0RST1V:
+      Opc = X86::T2RPNTLVWZ0RST1;
+      break;
+    case X86::PT2RPNTLVWZ1RSV:
+      Opc = X86::T2RPNTLVWZ1RS;
+      break;
+    case X86::PT2RPNTLVWZ1RST1V:
+      Opc = X86::T2RPNTLVWZ1RST1;
+      break;
     default:
       llvm_unreachable("Impossible Opcode!");
     }

diff  --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 8ad8641d1de485..72de0e0e8761f2 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -337,6 +337,10 @@ namespace {
       case X86::PT2RPNTLVWZ0T1V:
       case X86::PT2RPNTLVWZ1V:
       case X86::PT2RPNTLVWZ1T1V:
+      case X86::PT2RPNTLVWZ0RSV:
+      case X86::PT2RPNTLVWZ0RST1V:
+      case X86::PT2RPNTLVWZ1RSV:
+      case X86::PT2RPNTLVWZ1RST1V:
         return true;
       }
       for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
@@ -5157,6 +5161,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       ReplaceNode(Node, Res);
       return;
     }
+    case Intrinsic::x86_tileloaddrs64_internal:
+    case Intrinsic::x86_tileloaddrst164_internal:
+      if (!Subtarget->hasAMXMOVRS())
+        break;
+      [[fallthrough]];
     case Intrinsic::x86_tileloadd64_internal:
     case Intrinsic::x86_tileloaddt164_internal: {
       if (!Subtarget->hasAMXTILE())
@@ -5164,9 +5173,23 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       auto *MFI =
           CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
       MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
-      unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
-                         ? X86::PTILELOADDV
-                         : X86::PTILELOADDT1V;
+      unsigned Opc;
+      switch (IntNo) {
+      default:
+        llvm_unreachable("Unexpected intrinsic!");
+      case Intrinsic::x86_tileloaddrs64_internal:
+        Opc = X86::PTILELOADDRSV;
+        break;
+      case Intrinsic::x86_tileloaddrst164_internal:
+        Opc = X86::PTILELOADDRST1V;
+        break;
+      case Intrinsic::x86_tileloadd64_internal:
+        Opc = X86::PTILELOADDV;
+        break;
+      case Intrinsic::x86_tileloaddt164_internal:
+        Opc = X86::PTILELOADDT1V;
+        break;
+      }
       // _tile_loadd_internal(row, col, buf, STRIDE)
       SDValue Base = Node->getOperand(4);
       SDValue Scale = getI8Imm(1, dl);
@@ -5270,6 +5293,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       ReplaceNode(Node, CNode);
       return;
     }
+    case Intrinsic::x86_tileloaddrs64:
+    case Intrinsic::x86_tileloaddrst164:
+      if (!Subtarget->hasAMXMOVRS())
+        break;
+      [[fallthrough]];
     case Intrinsic::x86_tileloadd64:
     case Intrinsic::x86_tileloaddt164:
     case Intrinsic::x86_tilestored64: {
@@ -5282,7 +5310,13 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       switch (IntNo) {
       default: llvm_unreachable("Unexpected intrinsic!");
       case Intrinsic::x86_tileloadd64:   Opc = X86::PTILELOADD; break;
+      case Intrinsic::x86_tileloaddrs64:
+        Opc = X86::PTILELOADDRS;
+        break;
       case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
+      case Intrinsic::x86_tileloaddrst164:
+        Opc = X86::PTILELOADDRST1;
+        break;
       case Intrinsic::x86_tilestored64:  Opc = X86::PTILESTORED; break;
       }
       // FIXME: Match displacement and scale.
@@ -5305,6 +5339,13 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       ReplaceNode(Node, CNode);
       return;
     }
+    case Intrinsic::x86_t2rpntlvwz0rs:
+    case Intrinsic::x86_t2rpntlvwz0rst1:
+    case Intrinsic::x86_t2rpntlvwz1rs:
+    case Intrinsic::x86_t2rpntlvwz1rst1:
+      if (!Subtarget->hasAMXMOVRS())
+        break;
+      [[fallthrough]];
     case Intrinsic::x86_t2rpntlvwz0:
     case Intrinsic::x86_t2rpntlvwz0t1:
     case Intrinsic::x86_t2rpntlvwz1:
@@ -5330,6 +5371,18 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       case Intrinsic::x86_t2rpntlvwz1t1:
         Opc = X86::PT2RPNTLVWZ1T1;
         break;
+      case Intrinsic::x86_t2rpntlvwz0rs:
+        Opc = X86::PT2RPNTLVWZ0RS;
+        break;
+      case Intrinsic::x86_t2rpntlvwz0rst1:
+        Opc = X86::PT2RPNTLVWZ0RST1;
+        break;
+      case Intrinsic::x86_t2rpntlvwz1rs:
+        Opc = X86::PT2RPNTLVWZ1RS;
+        break;
+      case Intrinsic::x86_t2rpntlvwz1rst1:
+        Opc = X86::PT2RPNTLVWZ1RST1;
+        break;
       }
       // FIXME: Match displacement and scale.
       unsigned TIndex = Node->getConstantOperandVal(2);
@@ -5340,9 +5393,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
       SDValue Segment = CurDAG->getRegister(0, MVT::i16);
       SDValue Chain = Node->getOperand(0);
-      MachineSDNode *CNode;
       SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
-      CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+      MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
       ReplaceNode(Node, CNode);
       return;
     }

diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c08efc9f7271e6..db04f3a48d4d03 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -27345,6 +27345,10 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
                          Operation.getValue(1));
     }
+    case Intrinsic::x86_t2rpntlvwz0rs_internal:
+    case Intrinsic::x86_t2rpntlvwz0rst1_internal:
+    case Intrinsic::x86_t2rpntlvwz1rs_internal:
+    case Intrinsic::x86_t2rpntlvwz1rst1_internal:
     case Intrinsic::x86_t2rpntlvwz0_internal:
     case Intrinsic::x86_t2rpntlvwz0t1_internal:
     case Intrinsic::x86_t2rpntlvwz1_internal:
@@ -27368,6 +27372,18 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       case Intrinsic::x86_t2rpntlvwz1t1_internal:
         Opc = X86::PT2RPNTLVWZ1T1V;
         break;
+      case Intrinsic::x86_t2rpntlvwz0rs_internal:
+        Opc = X86::PT2RPNTLVWZ0RSV;
+        break;
+      case Intrinsic::x86_t2rpntlvwz0rst1_internal:
+        Opc = X86::PT2RPNTLVWZ0RST1V;
+        break;
+      case Intrinsic::x86_t2rpntlvwz1rs_internal:
+        Opc = X86::PT2RPNTLVWZ1RSV;
+        break;
+      case Intrinsic::x86_t2rpntlvwz1rst1_internal:
+        Opc = X86::PT2RPNTLVWZ1RST1V;
+        break;
       }
 
       SDLoc DL(Op);
@@ -37533,6 +37549,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
     return BB;
   }
+  case X86::PTILELOADDRS:
+  case X86::PTILELOADDRST1:
   case X86::PTILELOADD:
   case X86::PTILELOADDT1:
   case X86::PTILESTORED: {
@@ -37550,6 +37568,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
       Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
       break;
 #undef GET_EGPR_IF_ENABLED
+    case X86::PTILELOADDRS:
+      Opc = X86::TILELOADDRS;
+      break;
+    case X86::PTILELOADDRST1:
+      Opc = X86::TILELOADDRST1;
+      break;
     }
 
     MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
@@ -37590,6 +37614,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MI.eraseFromParent(); // The pseudo is gone now.
     return BB;
   }
+  case X86::PT2RPNTLVWZ0RS:
+  case X86::PT2RPNTLVWZ0RST1:
+  case X86::PT2RPNTLVWZ1RS:
+  case X86::PT2RPNTLVWZ1RST1:
   case X86::PT2RPNTLVWZ0:
   case X86::PT2RPNTLVWZ0T1:
   case X86::PT2RPNTLVWZ1:
@@ -37611,6 +37639,18 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     case X86::PT2RPNTLVWZ1T1:
       Opc = X86::T2RPNTLVWZ1T1;
       break;
+    case X86::PT2RPNTLVWZ0RS:
+      Opc = X86::T2RPNTLVWZ0RS;
+      break;
+    case X86::PT2RPNTLVWZ0RST1:
+      Opc = X86::T2RPNTLVWZ0RST1;
+      break;
+    case X86::PT2RPNTLVWZ1RS:
+      Opc = X86::T2RPNTLVWZ1RS;
+      break;
+    case X86::PT2RPNTLVWZ1RST1:
+      Opc = X86::T2RPNTLVWZ1RST1;
+      break;
     }
     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
     MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);

diff  --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index 04527716e31627..059bfb4c70bcf9 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -370,6 +370,70 @@ let Predicates = [HasAMXTRANSPOSE, In64BitMode] in {
   }
 } // HasAMXTILE, HasAMXTRANSPOSE
 
+let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in {
+  def T2RPNTLVWZ0RS   : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst),
+                        (ins sibmem:$src1),
+                        "t2rpntlvwz0rs\t{$src1, $dst|$dst, $src1}",
+                        []>, VEX, T_MAP5;
+  def T2RPNTLVWZ0RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst),
+                        (ins sibmem:$src1),
+                        "t2rpntlvwz0rst1\t{$src1, $dst|$dst, $src1}",
+                        []>, VEX, T_MAP5;
+  def T2RPNTLVWZ1RS   : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst),
+                        (ins sibmem:$src1),
+                        "t2rpntlvwz1rs\t{$src1, $dst|$dst, $src1}",
+                        []>, VEX, T_MAP5, PD;
+  def T2RPNTLVWZ1RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst),
+                        (ins sibmem:$src1),
+                        "t2rpntlvwz1rst1\t{$src1, $dst|$dst, $src1}",
+                        []>, VEX, T_MAP5, PD;
+  let isPseudo = true in {
+    def PT2RPNTLVWZ0RSV   : PseudoI<(outs TILEPair:$dst),
+                              (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
+                              []>;
+    def PT2RPNTLVWZ0RST1V : PseudoI<(outs TILEPair:$dst),
+                              (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
+                              []>;
+    def PT2RPNTLVWZ1RSV   : PseudoI<(outs TILEPair:$dst),
+                              (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
+                              []>;
+    def PT2RPNTLVWZ1RST1V : PseudoI<(outs TILEPair:$dst),
+                              (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4),
+                              []>;
+  }
+  let  usesCustomInserter = 1 in {
+    def PT2RPNTLVWZ0RS   : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
+    def PT2RPNTLVWZ0RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
+    def PT2RPNTLVWZ1RS   : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
+    def PT2RPNTLVWZ1RST1 : PseudoI<(outs), (ins u8imm:$dst, sibmem:$src1), []>;
+  }
+} // HasAMXMOVRS, HasAMXTRANSPOSE
+
+let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in {
+  def TILELOADDRS : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst),
+                    (ins sibmem:$src1),
+                    "tileloaddrs\t{$src1, $dst|$dst, $src1}",
+                    []>, VEX, T8, XD;
+  def TILELOADDRST1 : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst),
+                    (ins sibmem:$src1),
+                    "tileloaddrst1\t{$src1, $dst|$dst, $src1}",
+                    []>, VEX, T8, PD;
+
+  let isPseudo = true, mayLoad = 1 in {
+    def PTILELOADDRSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+                                                  GR16:$src2,
+                                                  opaquemem:$src3), []>;
+    def PTILELOADDRST1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+                                                    GR16:$src2,
+                                                    opaquemem:$src3), []>;
+  }
+
+  let usesCustomInserter = 1, mayLoad = 1 in {
+    def PTILELOADDRS : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>;
+    def PTILELOADDRST1 : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>;
+  }
+} // HasAMXMOVRS, In64BitMode
+
 multiclass m_tcvtrowd2ps {
   let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
     let SchedRW = [WriteSystem] in {

diff  --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index a9ec5f660ff1d8..eb2e93a94b197c 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -184,6 +184,7 @@ def HasAMXBF16   : Predicate<"Subtarget->hasAMXBF16()">;
 def HasAMXINT8   : Predicate<"Subtarget->hasAMXINT8()">;
 def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">;
 def HasAMXFP8    : Predicate<"Subtarget->hasAMXFP8()">;
+def HasAMXMOVRS  : Predicate<"Subtarget->hasAMXMOVRS()">;
 def HasAMXTRANSPOSE : Predicate<"Subtarget->hasAMXTRANSPOSE()">;
 def HasAMXAVX512 : Predicate<"Subtarget->hasAMXAVX512()">;
 def HasAMXTF32   : Predicate<"Subtarget->hasAMXTF32()">;

diff  --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index 0e74cfa75e9606..3dc69d4ecb0201 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -227,7 +227,13 @@ std::pair<Value *, Value *> ShapeCalculator::getShape(IntrinsicInst *II,
   case Intrinsic::x86_t2rpntlvwz1t1_internal:
   case Intrinsic::x86_tileloadd64_internal:
   case Intrinsic::x86_tileloaddt164_internal:
-  case Intrinsic::x86_tilestored64_internal: {
+  case Intrinsic::x86_tilestored64_internal:
+  case Intrinsic::x86_t2rpntlvwz0rs_internal:
+  case Intrinsic::x86_t2rpntlvwz0rst1_internal:
+  case Intrinsic::x86_t2rpntlvwz1rs_internal:
+  case Intrinsic::x86_t2rpntlvwz1rst1_internal:
+  case Intrinsic::x86_tileloaddrs64_internal:
+  case Intrinsic::x86_tileloaddrst164_internal: {
     Row = II->getArgOperand(0);
     Col = II->getArgOperand(1);
     break;

diff  --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 09418c9bb74d34..08f6f43c72638b 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -1077,6 +1077,8 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
   case X86::PTCMMIMFP16PSV:
   case X86::PTCMMRLFP16PSV:
   case X86::PTTRANSPOSEDV:
+  case X86::PTILELOADDRSV:
+  case X86::PTILELOADDRST1V:
   case X86::PTMMULTF32PSV:
   case X86::PTTMMULTF32PSV: {
     MachineOperand &MO1 = MI->getOperand(1);
@@ -1088,7 +1090,11 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
   case X86::PT2RPNTLVWZ0V:
   case X86::PT2RPNTLVWZ0T1V:
   case X86::PT2RPNTLVWZ1V:
-  case X86::PT2RPNTLVWZ1T1V: {
+  case X86::PT2RPNTLVWZ1T1V:
+  case X86::PT2RPNTLVWZ0RSV:
+  case X86::PT2RPNTLVWZ0RST1V:
+  case X86::PT2RPNTLVWZ1RSV:
+  case X86::PT2RPNTLVWZ1RST1V: {
     MachineOperand &MO1 = MI->getOperand(1);
     MachineOperand &MO2 = MI->getOperand(2);
     MachineOperand &MO3 = MI->getOperand(3);

diff  --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 140e565e1686f2..58ba2553633221 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1882,6 +1882,7 @@ const StringMap<bool> sys::getHostCPUFeatures() {
   Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave;
   Features["amx-tf32"] = HasLeaf1E && ((EAX >> 6) & 1) && HasAMXSave;
   Features["amx-avx512"] = HasLeaf1E && ((EAX >> 7) & 1) && HasAMXSave;
+  Features["amx-movrs"] = HasLeaf1E && ((EAX >> 8) & 1) && HasAMXSave;
 
   bool HasLeaf24 =
       MaxLevel >= 0x24 && !getX86CpuIDAndInfo(0x24, &EAX, &EBX, &ECX, &EDX);

diff  --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index 6b53424833bd47..0da740743c9b7c 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -600,6 +600,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE;
+constexpr FeatureBitset ImpliedFeaturesAMX_MOVRS = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_AVX512 =
     FeatureAMX_TILE | FeatureAVX10_2_512;
 constexpr FeatureBitset ImpliedFeaturesAMX_TF32 = FeatureAMX_TILE;

diff  --git a/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll
new file mode 100755
index 00000000000000..da212a1850964e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-movrs | FileCheck %s
+
+define void @test_amx_internal(i16 %m, i16 %n, ptr %buf, i64 %s) {
+; CHECK-LABEL: test_amx_internal:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    .cfi_def_cfa_register %rbp
+; CHECK-NEXT:    andq $-1024, %rsp # imm = 0xFC00
+; CHECK-NEXT:    subq $3072, %rsp # imm = 0xC00
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; CHECK-NEXT:    movw %ax, %cx
+; CHECK-NEXT:    movw %di, %ax
+; CHECK-NEXT:    # implicit-def: $al
+; CHECK-NEXT:    movb %al, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    tileloaddrs (%rdx,%rsi), %tmm0
+; CHECK-NEXT:    movl $64, %esi
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT:    tilestored %tmm0, (%rdx,%rsi)
+; CHECK-NEXT:    movq %rbp, %rsp
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
+; CHECK-NEXT:    tilerelease
+; CHECK-NEXT:    retq
+entry:
+  %t1 = call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %m, i16 %n, ptr %buf, i64 %s)
+  %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
+  ret void
+}
+declare x86_amx @llvm.x86.tileloaddrs64.internal(i16, i16, ptr, i64)
+
+define void @test_amx_old(i16 %m, i16 %n, ptr %buf) {
+; CHECK-LABEL: test_amx_old:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl $32, %eax
+; CHECK-NEXT:    tileloaddrs (%rdx,%rax), %tmm2
+; CHECK-NEXT:    retq
+entry:
+  call void @llvm.x86.tileloaddrs64(i8 2, ptr %buf, i64 32)
+  ret void
+}
+declare void @llvm.x86.tileloaddrs64(i8 immarg, ptr, i64)
+
+define void @test_amx_t1_internal(i16 %m, i16 %n, ptr %buf, i64 %s) {
+; CHECK-LABEL: test_amx_t1_internal:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    .cfi_def_cfa_register %rbp
+; CHECK-NEXT:    andq $-1024, %rsp # imm = 0xFC00
+; CHECK-NEXT:    subq $3072, %rsp # imm = 0xC00
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; CHECK-NEXT:    movw %ax, %cx
+; CHECK-NEXT:    movw %di, %ax
+; CHECK-NEXT:    # implicit-def: $al
+; CHECK-NEXT:    movb %al, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    tileloaddrst1 (%rdx,%rsi), %tmm0
+; CHECK-NEXT:    movl $64, %esi
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT:    tilestored %tmm0, (%rdx,%rsi)
+; CHECK-NEXT:    movq %rbp, %rsp
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
+; CHECK-NEXT:    tilerelease
+; CHECK-NEXT:    retq
+entry:
+  %t1 = call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %m, i16 %n, ptr %buf, i64 %s)
+  %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
+  ret void
+}
+declare x86_amx @llvm.x86.tileloaddrst164.internal(i16, i16, ptr, i64)
+
+define void @test_amx_t1_old(i16 %m, i16 %n, ptr %buf) {
+; CHECK-LABEL: test_amx_t1_old:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl $32, %eax
+; CHECK-NEXT:    tileloaddrst1 (%rdx,%rax), %tmm2
+; CHECK-NEXT:    retq
+entry:
+  call void @llvm.x86.tileloaddrst164(i8 2, ptr %buf, i64 32)
+  ret void
+}
+declare void @llvm.x86.tileloaddrst164(i8 immarg, ptr, i64)

diff  --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll
new file mode 100755
index 00000000000000..146b69773eb186
--- /dev/null
+++ b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0
+; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2
+
+define void @test_amx(i64 %stride, i8* %addr1) #0 {
+; CHECK-LABEL: test_amx:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    t2rpntlvwz0rs (%rsi,%rdi), %tmm0
+; CHECK-NEXT:    t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2
+; CHECK-NEXT:    t2rpntlvwz1rs (%rsi,%rdi), %tmm0
+; CHECK-NEXT:    t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2
+; CHECK-NEXT:    retq
+  call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride)
+  call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride)
+  call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride)
+  call void @llvm.x86.t2rpntlvwz1rst1(i8 2, i8* %addr1, i64 %stride)
+  ret void
+}
+declare void @llvm.x86.t2rpntlvwz0rs(i8 , i8* , i64 )
+declare void @llvm.x86.t2rpntlvwz0rst1(i8 , i8* , i64 )
+declare void @llvm.x86.t2rpntlvwz1rs(i8 , i8* , i64 )
+declare void @llvm.x86.t2rpntlvwz1rst1(i8 , i8* , i64 )
+
+define void @test_amx2(i8* %base, i64 %stride) #0 {
+; O0-LABEL: test_amx2:
+; O0:       # %bb.0:
+; O0-NEXT:    xorps %xmm0, %xmm0
+; O0-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; O0-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; O0-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; O0-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; O0-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
+; O0-NEXT:    movw $8, %ax
+; O0-NEXT:    # implicit-def: $al
+; O0-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; O0-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; O0-NEXT:    # implicit-def: $al
+; O0-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; O0-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; O0-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
+; O0-NEXT:    t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4
+; O0-NEXT:    movw $8, %ax
+; O0-NEXT:    # implicit-def: $al
+; O0-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; O0-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; O0-NEXT:    # implicit-def: $al
+; O0-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; O0-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; O0-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
+; O0-NEXT:    t2rpntlvwz1rs (%rdi,%rsi), %tmm4
+; O0-NEXT:    movw $8, %ax
+; O0-NEXT:    # implicit-def: $al
+; O0-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; O0-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; O0-NEXT:    # implicit-def: $al
+; O0-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; O0-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; O0-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
+; O0-NEXT:    t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4
+; O0-NEXT:    tilerelease
+; O0-NEXT:    retq
+;
+; O2-LABEL: test_amx2:
+; O2:       # %bb.0:
+; O2-NEXT:    xorps %xmm0, %xmm0
+; O2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; O2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; O2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; O2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; O2-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
+; O2-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; O2-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
+; O2-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; O2-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
+; O2-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
+; O2-NEXT:    movw $8, %ax
+; O2-NEXT:    t2rpntlvwz0rs (%rdi,%rsi), %tmm4
+; O2-NEXT:    t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4
+; O2-NEXT:    t2rpntlvwz1rs (%rdi,%rsi), %tmm4
+; O2-NEXT:    t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4
+; O2-NEXT:    tilerelease
+; O2-NEXT:    retq
+  call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
+  call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
+  call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
+  call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
+  ret void
+}
+declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16, i16, i16, i8*, i64)
+declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16, i16, i16, i8*, i64)
+declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16, i16, i16, i8*, i64)
+declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16, i16, i16, i8*, i64)

diff  --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt
new file mode 100755
index 00000000000000..6df44c87d2332f
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt
@@ -0,0 +1,98 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s -check-prefix=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s -check-prefix=INTEL
+
+# ATT:   t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6
+# INTEL: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456]
+0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2
+# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291]
+0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   t2rpntlvwz0rs 64(%rbx), %tmm6
+# INTEL: t2rpntlvwz0rs tmm6, [rbx + 64]
+0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40
+
+# ATT:   t2rpntlvwz0rs -32(,%rbp,2), %tmm2
+# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32]
+0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT:   t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6
+# INTEL: t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456]
+0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2
+# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291]
+0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   t2rpntlvwz0rst1 64(%rbx), %tmm6
+# INTEL: t2rpntlvwz0rst1 tmm6, [rbx + 64]
+0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40
+
+# ATT:   t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2
+# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
+0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT:   t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6
+# INTEL: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456]
+0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2
+# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291]
+0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   t2rpntlvwz1rs 64(%rbx), %tmm6
+# INTEL: t2rpntlvwz1rs tmm6, [rbx + 64]
+0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40
+
+# ATT:   t2rpntlvwz1rs -32(,%rbp,2), %tmm2
+# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32]
+0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT:   t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6
+# INTEL: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456]
+0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2
+# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291]
+0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   t2rpntlvwz1rst1 64(%rbx), %tmm6
+# INTEL: t2rpntlvwz1rst1 tmm6, [rbx + 64]
+0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40
+
+# ATT:   t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2
+# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
+0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT:   tileloaddrs 268435456(%rbp,%r14,8), %tmm6
+# INTEL: tileloaddrs tmm6, [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   tileloaddrs 291(%r8,%rax,4), %tmm3
+# INTEL: tileloaddrs tmm3, [r8 + 4*rax + 291]
+0xc4,0xc2,0x7b,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   tileloaddrs 64(%rbx), %tmm6
+# INTEL: tileloaddrs tmm6, [rbx + 64]
+0xc4,0xe2,0x7b,0x4a,0x74,0x23,0x40
+
+# ATT:   tileloaddrs -32(,%rbp,2), %tmm3
+# INTEL: tileloaddrs tmm3, [2*rbp - 32]
+0xc4,0xe2,0x7b,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff
+
+# ATT:   tileloaddrst1 268435456(%rbp,%r14,8), %tmm6
+# INTEL: tileloaddrst1 tmm6, [rbp + 8*r14 + 268435456]
+0xc4,0xa2,0x79,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10
+
+# ATT:   tileloaddrst1 291(%r8,%rax,4), %tmm3
+# INTEL: tileloaddrst1 tmm3, [r8 + 4*rax + 291]
+0xc4,0xc2,0x79,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00
+
+# ATT:   tileloaddrst1 64(%rbx), %tmm6
+# INTEL: tileloaddrst1 tmm6, [rbx + 64]
+0xc4,0xe2,0x79,0x4a,0x74,0x23,0x40
+
+# ATT:   tileloaddrst1 -32(,%rbp,2), %tmm3
+# INTEL: tileloaddrst1 tmm3, [2*rbp - 32]
+0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff

diff  --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s
new file mode 100755
index 00000000000000..d780ad4f0e3691
--- /dev/null
+++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s
@@ -0,0 +1,89 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK: t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6
+// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          t2rpntlvwz0rs 268435456(%rbp,%r14,8), %tmm6
+
+// CHECK: t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2
+// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00]
+          t2rpntlvwz0rs 291(%r8,%rax,4), %tmm2
+
+// CHECK: t2rpntlvwz0rs 64(%rbx), %tmm6
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40]
+          t2rpntlvwz0rs 64(%rbx), %tmm6
+
+// CHECK: t2rpntlvwz0rs -32(,%rbp,2), %tmm2
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
+          t2rpntlvwz0rs -32(,%rbp,2), %tmm2
+
+// CHECK: t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6
+// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          t2rpntlvwz0rst1 268435456(%rbp,%r14,8), %tmm6
+
+// CHECK: t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2
+// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00]
+          t2rpntlvwz0rst1 291(%r8,%rax,4), %tmm2
+
+// CHECK: t2rpntlvwz0rst1 64(%rbx), %tmm6
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40]
+          t2rpntlvwz0rst1 64(%rbx), %tmm6
+
+// CHECK: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
+          t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2
+
+// CHECK: t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6
+// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          t2rpntlvwz1rs 268435456(%rbp,%r14,8), %tmm6
+
+// CHECK: t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2
+// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00]
+          t2rpntlvwz1rs 291(%r8,%rax,4), %tmm2
+
+// CHECK: t2rpntlvwz1rs 64(%rbx), %tmm6
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40]
+          t2rpntlvwz1rs 64(%rbx), %tmm6
+
+// CHECK: t2rpntlvwz1rs -32(,%rbp,2), %tmm2
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
+          t2rpntlvwz1rs -32(,%rbp,2), %tmm2
+
+// CHECK: t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6
+// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          t2rpntlvwz1rst1 268435456(%rbp,%r14,8), %tmm6
+
+// CHECK: t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2
+// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00]
+          t2rpntlvwz1rst1 291(%r8,%rax,4), %tmm2
+
+// CHECK: t2rpntlvwz1rst1 64(%rbx), %tmm6
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40]
+          t2rpntlvwz1rst1 64(%rbx), %tmm6
+
+// CHECK: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
+          t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2
+
+// CHECK: tileloaddrs 268435456(%rbp,%r14,8), %tmm6
+// CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          tileloaddrs 268435456(%rbp,%r14,8), %tmm6
+
+// CHECK: tileloaddrs 291(%r8,%rax,4), %tmm3
+// CHECK: encoding: [0xc4,0xc2,0x7b,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00]
+          tileloaddrs 291(%r8,%rax,4), %tmm3
+
+// CHECK: tileloaddrs -32(,%rbp,2), %tmm3
+// CHECK: encoding: [0xc4,0xe2,0x7b,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff]
+          tileloaddrs -32(,%rbp,2), %tmm3
+
+// CHECK: tileloaddrst1 268435456(%rbp,%r14,8), %tmm6
+// CHECK: encoding: [0xc4,0xa2,0x79,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          tileloaddrst1 268435456(%rbp,%r14,8), %tmm6
+
+// CHECK: tileloaddrst1 291(%r8,%rax,4), %tmm3
+// CHECK: encoding: [0xc4,0xc2,0x79,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00]
+          tileloaddrst1 291(%r8,%rax,4), %tmm3
+
+// CHECK: tileloaddrst1 -32(,%rbp,2), %tmm3
+// CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff]
+          tileloaddrst1 -32(,%rbp,2), %tmm3
\ No newline at end of file

diff  --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s
new file mode 100755
index 00000000000000..ccc7ac51a98a44
--- /dev/null
+++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s
@@ -0,0 +1,97 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding  %s | FileCheck %s
+
+// CHECK: t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa5,0x78,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          t2rpntlvwz0rs tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc5,0x78,0xf8,0x94,0x80,0x23,0x01,0x00,0x00]
+          t2rpntlvwz0rs tmm2, [r8 + 4*rax + 291]
+
+// CHECK: t2rpntlvwz0rs tmm6, [rbx + 64]
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x74,0x23,0x40]
+          t2rpntlvwz0rs tmm6, [rbx + 64]
+
+// CHECK: t2rpntlvwz0rs tmm2, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
+          t2rpntlvwz0rs tmm2, [2*rbp - 32]
+
+// CHECK: t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa5,0x78,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          t2rpntlvwz0rst1 tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc5,0x78,0xf9,0x94,0x80,0x23,0x01,0x00,0x00]
+          t2rpntlvwz0rst1 tmm2, [r8 + 4*rax + 291]
+
+// CHECK: t2rpntlvwz0rst1 tmm6, [rbx + 64]
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x74,0x23,0x40]
+          t2rpntlvwz0rst1 tmm6, [rbx + 64]
+
+// CHECK: t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
+          t2rpntlvwz0rst1 tmm2, [2*rbp - 32]
+
+// CHECK: t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa5,0x79,0xf8,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          t2rpntlvwz1rs tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc5,0x79,0xf8,0x94,0x80,0x23,0x01,0x00,0x00]
+          t2rpntlvwz1rs tmm2, [r8 + 4*rax + 291]
+
+// CHECK: t2rpntlvwz1rs tmm6, [rbx + 64]
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x74,0x23,0x40]
+          t2rpntlvwz1rs tmm6, [rbx + 64]
+
+// CHECK: t2rpntlvwz1rs tmm2, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff]
+          t2rpntlvwz1rs tmm2, [2*rbp - 32]
+
+// CHECK: t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa5,0x79,0xf9,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          t2rpntlvwz1rst1 tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc5,0x79,0xf9,0x94,0x80,0x23,0x01,0x00,0x00]
+          t2rpntlvwz1rst1 tmm2, [r8 + 4*rax + 291]
+
+// CHECK: t2rpntlvwz1rst1 tmm6, [rbx + 64]
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x74,0x23,0x40]
+          t2rpntlvwz1rst1 tmm6, [rbx + 64]
+
+// CHECK: t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff]
+          t2rpntlvwz1rst1 tmm2, [2*rbp - 32]
+
+// CHECK: tileloaddrs tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x7b,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          tileloaddrs tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: tileloaddrs tmm3, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x7b,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00]
+          tileloaddrs tmm3, [r8 + 4*rax + 291]
+
+// CHECK: tileloaddrs tmm6, [rbx + 64]
+// CHECK: encoding: [0xc4,0xe2,0x7b,0x4a,0x74,0x23,0x40]
+          tileloaddrs tmm6, [rbx + 64]
+
+// CHECK: tileloaddrs tmm3, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe2,0x7b,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff]
+          tileloaddrs tmm3, [2*rbp - 32]
+
+// CHECK: tileloaddrst1 tmm6, [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0xa2,0x79,0x4a,0xb4,0xf5,0x00,0x00,0x00,0x10]
+          tileloaddrst1 tmm6, [rbp + 8*r14 + 268435456]
+
+// CHECK: tileloaddrst1 tmm3, [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0xc2,0x79,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00]
+          tileloaddrst1 tmm3, [r8 + 4*rax + 291]
+
+// CHECK: tileloaddrst1 tmm6, [rbx + 64]
+// CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x74,0x23,0x40]
+          tileloaddrst1 tmm6, [rbx + 64]
+
+// CHECK: tileloaddrst1 tmm3, [2*rbp - 32]
+// CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff]
+          tileloaddrst1 tmm3, [2*rbp - 32]


        


More information about the cfe-commits mailing list