[Mlir-commits] [mlir] [mlir][amdgpu] Add amdgpu.make_dma_descriptor (PR #169407)

Erick Ochoa Lopez llvmlistbot at llvm.org
Mon Nov 24 14:53:24 PST 2025


https://github.com/amd-eochoalo updated https://github.com/llvm/llvm-project/pull/169407

>From a4a1a59d894aae479a1bd5aebe2705431b6588b5 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Fri, 21 Nov 2025 12:56:29 -0500
Subject: [PATCH 01/13] [mlir][amdgpu] Add make_dma_base operation

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 55 +++++++++++++++++++
 .../mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h    |  4 ++
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  |  7 +++
 mlir/test/Dialect/AMDGPU/ops.mlir             | 12 ++++
 4 files changed, 78 insertions(+)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 4820b7a747ac2..04043f47c3539 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -33,6 +33,7 @@ def AMDGPU_Dialect : Dialect {
     "gpu::GPUDialect"
   ];
   let useDefaultAttributePrinterParser = 1;
+  let useDefaultTypePrinterParser = 1;
 }
 
 def AnyIntegerOrFloat : AnyTypeOf<[AnySignlessInteger, AnyFloat], "Integer or Float">;
@@ -79,6 +80,36 @@ def AMDGPU_AddressSpaceAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_AddressSpace,
   let assemblyFormat = "`<` $value `>`";
 }
 
+class AMDGPU_Type<string name, string typeMnemonic, list<Trait> traits = []>
+    : TypeDef<AMDGPU_Dialect, name, traits> {
+  let mnemonic = typeMnemonic;
+}
+
+//===----------------------------------------------------------------------===//
+// AMDGPU Type definitions
+//===----------------------------------------------------------------------===//
+
+def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
+  // TODO:
+  // * Add verifiers such that one of the memrefs is from LDS and the other global.
+  // * Add verifiers to make sure that the type is in the correct direction.
+  // * Add verifiers to make sure that the number of indices do not exceed the number of dimensions.
+
+  let summary = "Pair of base addresses that move data between LDS and global storage.";
+  let description = [{
+    This type is opaque and it is used to represent a struct of two addresses.
+    One address is in LDS while the other is in global memory.
+  }];
+  let parameters = (ins "Type":$elementType);
+  let builders = [
+    TypeBuilderWithInferredContext<(ins "Type":$elementType), [{
+      return $_get(elementType.getContext(), elementType);
+    }]>
+  ];
+  let assemblyFormat = "`<` $elementType `>`";
+
+}
+
 //===----------------------------------------------------------------------===//
 // AMDGPU Op definitions
 //===----------------------------------------------------------------------===//
@@ -1192,4 +1223,28 @@ def AMDGPU_ScaledMFMAOp :
   }];
   let hasCanonicalizer = 1;
 }
+
+def AMDGPU_MakeDmaBaseOp :
+    AMDGPU_Op<"make_dma_base", [AttrSizedOperandSegments]>,
+    Arguments<(ins
+                   Arg<AnyMemRef, "buffer to read from", [MemRead]>:$src,
+                   Variadic<Index>:$srcIndices,
+                   Arg<AnyMemRef, "buffer to write to", [MemWrite]>:$dst,
+                   Variadic<Index>:$dstIndices)>,
+    Results<(outs AMDGPU_TDMBaseType: $base)> {
+
+  let summary = "Pair of based addresses used when moving tiles between LDS and global memory.";
+  let description = [{
+    This operation creates a pair of addresses that will be used by tensor_load_to_lds
+    and tensor_store_from_lds.
+
+    This operation creates a value corresponding roughly to the descriptor group 0
+    found in TensorLoadToLDSOp and TensorStoreFromLDSOp in the rocdl dialect.
+  }];
+
+  let assemblyFormat = [{
+    $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` type($src) `,` type($dst) `to` type(results)
+  }];
+}
+
 #endif // AMDGPU
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
index dcd9f95a7561f..a7680fb5c3191 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
@@ -25,6 +25,7 @@
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h.inc"
 
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUEnums.h.inc"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUTypes.h.inc"
 
 namespace mlir::amdgpu {
 /// Parser for the `custom<MNKDimensionList>` custom assembly format used by
@@ -52,6 +53,9 @@ inline void printMNKDimensionList(OpAsmPrinter &printer, Operation *,
 #define GET_ATTRDEF_CLASSES
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.h.inc"
 
+#define GET_TYPEDEF_CLASSES
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUTypes.h.inc"
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/AMDGPU/IR/AMDGPU.h.inc"
 
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index d55f3cec47c1f..cdc10c60a42ae 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -55,6 +55,10 @@ void AMDGPUDialect::initialize() {
 #define GET_OP_LIST
 #include "mlir/Dialect/AMDGPU/IR/AMDGPU.cpp.inc"
       >();
+  addTypes<
+#define GET_TYPEDEF_LIST
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUTypes.cpp.inc"
+      >();
   addAttributes<
 #define GET_ATTRDEF_LIST
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.cpp.inc"
@@ -839,5 +843,8 @@ void ScaledMFMAOp::getCanonicalizationPatterns(RewritePatternSet &results,
 #define GET_ATTRDEF_CLASSES
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.cpp.inc"
 
+#define GET_TYPEDEF_CLASSES
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUTypes.cpp.inc"
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/AMDGPU/IR/AMDGPU.cpp.inc"
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 09134cb4704bb..653f9f64d24f4 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -685,3 +685,15 @@ func.func @memory_counter_wait() {
   amdgpu.memory_counter_wait exp(4)
   func.return
 }
+
+// CHECK-LABEL: func @make_dma_base
+// CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32>, %[[SMEM:.+]]: memref<8xi32, #gpu.address_space<workgroup>>)
+func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32, #gpu.address_space<workgroup>>) {
+  // CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> to !amdgpu.tdm_base<i32>
+  amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> to !amdgpu.tdm_base<i32>
+
+  // CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space<workgroup>>, memref<8xi32> to !amdgpu.tdm_base<i32>
+  amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space<workgroup>>, memref<8xi32> to !amdgpu.tdm_base<i32>
+  func.return
+}
+

>From d14f3e28cc79774adb744ae6ee6d98684f120fa7 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Mon, 24 Nov 2025 09:18:26 -0500
Subject: [PATCH 02/13] Remove MemRead and MemWrite from operation

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 04043f47c3539..990d377dc9d7b 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1227,9 +1227,9 @@ def AMDGPU_ScaledMFMAOp :
 def AMDGPU_MakeDmaBaseOp :
     AMDGPU_Op<"make_dma_base", [AttrSizedOperandSegments]>,
     Arguments<(ins
-                   Arg<AnyMemRef, "buffer to read from", [MemRead]>:$src,
+                   Arg<AnyMemRef, "buffer to read from">:$src,
                    Variadic<Index>:$srcIndices,
-                   Arg<AnyMemRef, "buffer to write to", [MemWrite]>:$dst,
+                   Arg<AnyMemRef, "buffer to write to">:$dst,
                    Variadic<Index>:$dstIndices)>,
     Results<(outs AMDGPU_TDMBaseType: $base)> {
 

>From d3ca18c937218a8f115e58b0a6d4d5b10bdc187a Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Mon, 24 Nov 2025 09:20:38 -0500
Subject: [PATCH 03/13] Add Pure to make_dma_base

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 990d377dc9d7b..645fc4655025a 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1225,7 +1225,7 @@ def AMDGPU_ScaledMFMAOp :
 }
 
 def AMDGPU_MakeDmaBaseOp :
-    AMDGPU_Op<"make_dma_base", [AttrSizedOperandSegments]>,
+    AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>,
     Arguments<(ins
                    Arg<AnyMemRef, "buffer to read from">:$src,
                    Variadic<Index>:$srcIndices,

>From 76e47f147ea84ec13d0a0afac5d5d2b963b9b49f Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Mon, 24 Nov 2025 12:11:43 -0500
Subject: [PATCH 04/13] Add DynamicIndexList

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 44 +++++++++++++++----
 mlir/test/Dialect/AMDGPU/ops.mlir             |  8 ++++
 2 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 645fc4655025a..e2fd78dab7ebf 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -80,21 +80,17 @@ def AMDGPU_AddressSpaceAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_AddressSpace,
   let assemblyFormat = "`<` $value `>`";
 }
 
-class AMDGPU_Type<string name, string typeMnemonic, list<Trait> traits = []>
-    : TypeDef<AMDGPU_Dialect, name, traits> {
-  let mnemonic = typeMnemonic;
-}
 
 //===----------------------------------------------------------------------===//
 // AMDGPU Type definitions
 //===----------------------------------------------------------------------===//
 
-def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
-  // TODO:
-  // * Add verifiers such that one of the memrefs is from LDS and the other global.
-  // * Add verifiers to make sure that the type is in the correct direction.
-  // * Add verifiers to make sure that the number of indices do not exceed the number of dimensions.
+class AMDGPU_Type<string name, string typeMnemonic, list<Trait> traits = []>
+    : TypeDef<AMDGPU_Dialect, name, traits> {
+  let mnemonic = typeMnemonic;
+}
 
+def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
   let summary = "Pair of base addresses that move data between LDS and global storage.";
   let description = [{
     This type is opaque and it is used to represent a struct of two addresses.
@@ -107,6 +103,14 @@ def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
     }]>
   ];
   let assemblyFormat = "`<` $elementType `>`";
+}
+
+def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> {
+  let summary = "Descriptors used in tensor store/load operations.";
+  let description = [{
+    This type is opaque and corresponds to the two or four descriptor groups
+    used in tensor_load_to_lds or tensor_store_from_lds.
+  }];
 
 }
 
@@ -1233,6 +1237,10 @@ def AMDGPU_MakeDmaBaseOp :
                    Variadic<Index>:$dstIndices)>,
     Results<(outs AMDGPU_TDMBaseType: $base)> {
 
+  // TODO:
+  // * Add verifiers such that one of the memrefs is from LDS and the other global.
+  // * Add verifiers to make sure that the number of indices do not exceed the number of dimensions.
+
   let summary = "Pair of based addresses used when moving tiles between LDS and global memory.";
   let description = [{
     This operation creates a pair of addresses that will be used by tensor_load_to_lds
@@ -1247,4 +1255,22 @@ def AMDGPU_MakeDmaBaseOp :
   }];
 }
 
+def AMDGPU_MakeDmaDescriptorOp :
+  AMDGPU_Op<"make_dma_descriptor", [Pure]>,
+  Arguments<(ins
+    AMDGPU_TDMBaseType: $base,
+    Variadic<Index>: $dynamic_sizes,
+    OptionalAttr<DenseI64ArrayAttr>: $static_sizes)>,
+  Results<(outs AMDGPU_TDMDescriptorType: $desc)> {
+
+  let summary = "TODO";
+  let description = [{
+    TODO
+  }];
+
+  let assemblyFormat = [{
+    $base `globalSize` custom<DynamicIndexList>($dynamic_sizes, $static_sizes) attr-dict `:` qualified(type($base)) `to` type(results)
+  }];
+}
+
 #endif // AMDGPU
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 653f9f64d24f4..818fd1afa2dc5 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -697,3 +697,11 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32,
   func.return
 }
 
+// CHECK-LABEL: func @make_dma_descriptor
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
+func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) {
+  // CHECK: amdgpu.make_dma_descriptor %[[BASE]] globalSize [0] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+  amdgpu.make_dma_descriptor %base globalSize [0] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+  func.return
+}
+

>From f6f67e39b85c97c39445fa436462c2da916dec40 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Mon, 24 Nov 2025 14:10:02 -0500
Subject: [PATCH 05/13] Add globalStride

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 11 ++++++++---
 mlir/test/Dialect/AMDGPU/ops.mlir             |  4 ++--
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index e2fd78dab7ebf..b08039064adff 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1256,11 +1256,13 @@ def AMDGPU_MakeDmaBaseOp :
 }
 
 def AMDGPU_MakeDmaDescriptorOp :
-  AMDGPU_Op<"make_dma_descriptor", [Pure]>,
+  AMDGPU_Op<"make_dma_descriptor", [Pure, AttrSizedOperandSegments]>,
   Arguments<(ins
     AMDGPU_TDMBaseType: $base,
     Variadic<Index>: $dynamic_sizes,
-    OptionalAttr<DenseI64ArrayAttr>: $static_sizes)>,
+    OptionalAttr<DenseI64ArrayAttr>: $static_sizes,
+    Variadic<Index>: $dynamic_strides,
+    OptionalAttr<DenseI64ArrayAttr>: $static_strides)>,
   Results<(outs AMDGPU_TDMDescriptorType: $desc)> {
 
   let summary = "TODO";
@@ -1269,7 +1271,10 @@ def AMDGPU_MakeDmaDescriptorOp :
   }];
 
   let assemblyFormat = [{
-    $base `globalSize` custom<DynamicIndexList>($dynamic_sizes, $static_sizes) attr-dict `:` qualified(type($base)) `to` type(results)
+    $base
+	`globalSize` custom<DynamicIndexList>($dynamic_sizes, $static_sizes)
+	`globalStride` custom<DynamicIndexList>($dynamic_strides, $static_strides)
+	attr-dict `:` qualified(type($base)) `to` type(results)
   }];
 }
 
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 818fd1afa2dc5..a36f59718f175 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -700,8 +700,8 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32,
 // CHECK-LABEL: func @make_dma_descriptor
 // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
 func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) {
-  // CHECK: amdgpu.make_dma_descriptor %[[BASE]] globalSize [0] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
-  amdgpu.make_dma_descriptor %base globalSize [0] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+  // CHECK: amdgpu.make_dma_descriptor %[[BASE]] globalSize [0] globalStride [1] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+  amdgpu.make_dma_descriptor %base globalSize [0] globalStride [1] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
   func.return
 }
 

>From 1e2668c8c4dfcf5588c49bfeae1a65be2ae15a98 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Mon, 24 Nov 2025 14:25:24 -0500
Subject: [PATCH 06/13] Add verifier for innermost dimension

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 14 ++++++++------
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  | 11 +++++++++++
 mlir/test/Dialect/AMDGPU/invalid.mlir         | 10 ++++++++++
 3 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index b08039064adff..e0a356533144d 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1259,10 +1259,10 @@ def AMDGPU_MakeDmaDescriptorOp :
   AMDGPU_Op<"make_dma_descriptor", [Pure, AttrSizedOperandSegments]>,
   Arguments<(ins
     AMDGPU_TDMBaseType: $base,
-    Variadic<Index>: $dynamic_sizes,
-    OptionalAttr<DenseI64ArrayAttr>: $static_sizes,
-    Variadic<Index>: $dynamic_strides,
-    OptionalAttr<DenseI64ArrayAttr>: $static_strides)>,
+    Variadic<Index>: $global_dynamic_sizes,
+    OptionalAttr<DenseI64ArrayAttr>: $global_static_sizes,
+    Variadic<Index>: $global_dynamic_strides,
+    OptionalAttr<DenseI64ArrayAttr>: $global_static_strides)>,
   Results<(outs AMDGPU_TDMDescriptorType: $desc)> {
 
   let summary = "TODO";
@@ -1272,10 +1272,12 @@ def AMDGPU_MakeDmaDescriptorOp :
 
   let assemblyFormat = [{
     $base
-	`globalSize` custom<DynamicIndexList>($dynamic_sizes, $static_sizes)
-	`globalStride` custom<DynamicIndexList>($dynamic_strides, $static_strides)
+	`globalSize` custom<DynamicIndexList>($global_dynamic_sizes, $global_static_sizes)
+	`globalStride` custom<DynamicIndexList>($global_dynamic_strides, $global_static_strides)
 	attr-dict `:` qualified(type($base)) `to` type(results)
   }];
+
+  let hasVerifier = 1;
 }
 
 #endif // AMDGPU
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index cdc10c60a42ae..4ade1164317af 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -705,6 +705,17 @@ LogicalResult TransposeLoadOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// MakeDmaDescriptorOp
+//===----------------------------------------------------------------------===//
+
+/// Rejects a static innermost global stride other than 1 (absent/empty is OK).
+LogicalResult MakeDmaDescriptorOp::verify() {
+  if (auto s = getGlobalStaticStrides(); s && !s->empty() && s->back() != 1)
+    return emitOpError("strides for the innermost dimension must be 1.");
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // ScaledMFMAOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 61fdf29a78cbd..f820060d2c718 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -354,3 +354,13 @@ func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32x
   %0 = amdgpu.scaled_mfma 32x32x32 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32>
   func.return %0 : vector<16xf32>
 }
+
+// -----
+
+// CHECK-LABEL: func @make_dma_descriptor
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
+func.func @make_dma_descriptor_invalid_strides(%base: !amdgpu.tdm_base<i32>) {
+  // expected-error @+1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}}
+  amdgpu.make_dma_descriptor %base globalSize [0] globalStride [1, 2] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+  func.return
+}

>From f1df3c5b9722cae7000d2c9584345befd7827dc9 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Mon, 24 Nov 2025 14:32:44 -0500
Subject: [PATCH 07/13] Add sharedSize

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td |  5 ++++-
 mlir/test/Dialect/AMDGPU/invalid.mlir         |  2 +-
 mlir/test/Dialect/AMDGPU/ops.mlir             | 10 ++++++++--
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index e0a356533144d..16ef34d1486cb 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1262,7 +1262,9 @@ def AMDGPU_MakeDmaDescriptorOp :
     Variadic<Index>: $global_dynamic_sizes,
     OptionalAttr<DenseI64ArrayAttr>: $global_static_sizes,
     Variadic<Index>: $global_dynamic_strides,
-    OptionalAttr<DenseI64ArrayAttr>: $global_static_strides)>,
+    OptionalAttr<DenseI64ArrayAttr>: $global_static_strides,
+    Variadic<Index>: $shared_dynamic_sizes,
+    OptionalAttr<DenseI64ArrayAttr>: $shared_static_sizes)>,
   Results<(outs AMDGPU_TDMDescriptorType: $desc)> {
 
   let summary = "TODO";
@@ -1274,6 +1276,7 @@ def AMDGPU_MakeDmaDescriptorOp :
     $base
 	`globalSize` custom<DynamicIndexList>($global_dynamic_sizes, $global_static_sizes)
 	`globalStride` custom<DynamicIndexList>($global_dynamic_strides, $global_static_strides)
+	`sharedSize` custom<DynamicIndexList>($shared_dynamic_sizes, $shared_static_sizes)
 	attr-dict `:` qualified(type($base)) `to` type(results)
   }];
 
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index f820060d2c718..e8a0bfe9476a7 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -361,6 +361,6 @@ func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32x
 // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
 func.func @make_dma_descriptor_invalid_strides(%base: !amdgpu.tdm_base<i32>) {
   // expected-error @+1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}}
-  amdgpu.make_dma_descriptor %base globalSize [0] globalStride [1, 2] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+  amdgpu.make_dma_descriptor %base globalSize [0] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
   func.return
 }
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index a36f59718f175..0db84a187ddf5 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -700,8 +700,14 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32,
 // CHECK-LABEL: func @make_dma_descriptor
 // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
 func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) {
-  // CHECK: amdgpu.make_dma_descriptor %[[BASE]] globalSize [0] globalStride [1] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
-  amdgpu.make_dma_descriptor %base globalSize [0] globalStride [1] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+  // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
+  // CHECK-SAME: globalSize [0]
+  // CHECK-SAME: globalStride [1]
+  // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+  amdgpu.make_dma_descriptor %base
+	globalSize [0]
+	globalStride [1]
+	sharedSize [0] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
   func.return
 }
 

>From a24a840b4eb2a2d2daefc91d8b32738fc48cb9d4 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Mon, 24 Nov 2025 15:11:16 -0500
Subject: [PATCH 08/13] Add optional atomic barrier

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 16ef34d1486cb..d73e35ce82806 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1264,7 +1264,10 @@ def AMDGPU_MakeDmaDescriptorOp :
     Variadic<Index>: $global_dynamic_strides,
     OptionalAttr<DenseI64ArrayAttr>: $global_static_strides,
     Variadic<Index>: $shared_dynamic_sizes,
-    OptionalAttr<DenseI64ArrayAttr>: $shared_static_sizes)>,
+    OptionalAttr<DenseI64ArrayAttr>: $shared_static_sizes,
+    Optional<AnyMemRef>: $atomic_barrier_address,
+    Variadic<Index>: $atomic_barrier_dynamic_indices,
+    OptionalAttr<DenseI64ArrayAttr>: $atomic_barrier_static_indices)>,
   Results<(outs AMDGPU_TDMDescriptorType: $desc)> {
 
   let summary = "TODO";
@@ -1274,10 +1277,13 @@ def AMDGPU_MakeDmaDescriptorOp :
 
   let assemblyFormat = [{
     $base
-	`globalSize` custom<DynamicIndexList>($global_dynamic_sizes, $global_static_sizes)
-	`globalStride` custom<DynamicIndexList>($global_dynamic_strides, $global_static_strides)
-	`sharedSize` custom<DynamicIndexList>($shared_dynamic_sizes, $shared_static_sizes)
-	attr-dict `:` qualified(type($base)) `to` type(results)
+    `globalSize` custom<DynamicIndexList>($global_dynamic_sizes, $global_static_sizes)
+    `globalStride` custom<DynamicIndexList>($global_dynamic_strides, $global_static_strides)
+    `sharedSize` custom<DynamicIndexList>($shared_dynamic_sizes, $shared_static_sizes)
+    ( `atomicBarrier` `(` $atomic_barrier_address^
+                          custom<DynamicIndexList>($atomic_barrier_dynamic_indices, $atomic_barrier_static_indices)
+                      `:` type($atomic_barrier_address) `)`)?
+    attr-dict `:` qualified(type($base)) `to` type(results)
   }];
 
   let hasVerifier = 1;

>From ccaf771d1fa91476adc4454991621d2e1d31d412 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Mon, 24 Nov 2025 15:32:17 -0500
Subject: [PATCH 09/13] Add iterate

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td |  6 +++-
 mlir/test/Dialect/AMDGPU/ops.mlir             | 36 ++++++++++++++++---
 2 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index d73e35ce82806..8c04e45a1983e 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1267,7 +1267,10 @@ def AMDGPU_MakeDmaDescriptorOp :
     OptionalAttr<DenseI64ArrayAttr>: $shared_static_sizes,
     Optional<AnyMemRef>: $atomic_barrier_address,
     Variadic<Index>: $atomic_barrier_dynamic_indices,
-    OptionalAttr<DenseI64ArrayAttr>: $atomic_barrier_static_indices)>,
+    OptionalAttr<DenseI64ArrayAttr>: $atomic_barrier_static_indices,
+    Optional<Index>: $global_increment,
+    Optional<Index>: $lds_increment,
+    Optional<Index>: $iteration_count)>,
   Results<(outs AMDGPU_TDMDescriptorType: $desc)> {
 
   let summary = "TODO";
@@ -1283,6 +1286,7 @@ def AMDGPU_MakeDmaDescriptorOp :
     ( `atomicBarrier` `(` $atomic_barrier_address^
                           custom<DynamicIndexList>($atomic_barrier_dynamic_indices, $atomic_barrier_static_indices)
                       `:` type($atomic_barrier_address) `)`)?
+    ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )?
     attr-dict `:` qualified(type($base)) `to` type(results)
   }];
 
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 0db84a187ddf5..6df7c300e5bc7 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -698,16 +698,42 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32,
 }
 
 // CHECK-LABEL: func @make_dma_descriptor
-// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
-func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) {
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[BARRIER:.+]]: memref<8xi32>, %[[IDX:.+]]: index)
+func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8xi32>, %idx: index) {
   // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
-  // CHECK-SAME: globalSize [0]
-  // CHECK-SAME: globalStride [1]
-  // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
   amdgpu.make_dma_descriptor %base
+        // CHECK-SAME: globalSize [0]
 	globalSize [0]
+        // CHECK-SAME: globalStride [1]
 	globalStride [1]
+        // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
 	sharedSize [0] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+
+  // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
+  amdgpu.make_dma_descriptor %base
+        // CHECK-SAME: globalSize [0]
+	globalSize [0]
+        // CHECK-SAME: globalStride [1]
+	globalStride [1]
+        // CHECK-SAME: sharedSize [0]
+	sharedSize [0]
+        // CHECK-SAME: atomicBarrier(%[[BARRIER]] [0] : memref<8xi32>)
+	atomicBarrier(%barrier [0] : memref<8xi32>)
+	: !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+
+  // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
+  amdgpu.make_dma_descriptor %base
+        // CHECK-SAME: globalSize [0]
+	globalSize [0]
+        // CHECK-SAME: globalStride [1]
+	globalStride [1]
+        // CHECK-SAME: sharedSize [0]
+	sharedSize [0]
+        iterate %idx, %idx, %idx
+        // CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]]
+	: !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+
+
   func.return
 }
 

>From 566d2e61a7f372e2308de60e6c9a224fcd309954 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Mon, 24 Nov 2025 16:13:51 -0500
Subject: [PATCH 10/13] [mlir][amdgpu] Add make_dma_descriptor.

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td |  5 ++++
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  | 22 ++++++++++++++++
 mlir/test/Dialect/AMDGPU/ops.mlir             | 25 +++++++++++++++++++
 3 files changed, 52 insertions(+)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 8c04e45a1983e..d33605220c442 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1265,6 +1265,10 @@ def AMDGPU_MakeDmaDescriptorOp :
     OptionalAttr<DenseI64ArrayAttr>: $global_static_strides,
     Variadic<Index>: $shared_dynamic_sizes,
     OptionalAttr<DenseI64ArrayAttr>: $shared_static_sizes,
+    Optional<Index>: $pad,
+    OptionalAttr<IndexAttr>: $pad_const,
+    Optional<Index>: $every,
+    OptionalAttr<IndexAttr>: $every_const,
     Optional<AnyMemRef>: $atomic_barrier_address,
     Variadic<Index>: $atomic_barrier_dynamic_indices,
     OptionalAttr<DenseI64ArrayAttr>: $atomic_barrier_static_indices,
@@ -1283,6 +1287,7 @@ def AMDGPU_MakeDmaDescriptorOp :
     `globalSize` custom<DynamicIndexList>($global_dynamic_sizes, $global_static_sizes)
     `globalStride` custom<DynamicIndexList>($global_dynamic_strides, $global_static_strides)
     `sharedSize` custom<DynamicIndexList>($shared_dynamic_sizes, $shared_static_sizes)
+    ( `padShared` `(` custom<DynamicIndex>($pad, $pad_const)^ `every` custom<DynamicIndex>($every, $every_const) `)` )?
     ( `atomicBarrier` `(` $atomic_barrier_address^
                           custom<DynamicIndexList>($atomic_barrier_dynamic_indices, $atomic_barrier_static_indices)
                       `:` type($atomic_barrier_address) `)`)?
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 4ade1164317af..b382fec21f20a 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -50,6 +50,37 @@ struct AMDGPUInlinerInterface final : DialectInlinerInterface {
 };
 } // namespace
 
+/// Parser for the `custom<DynamicIndex>` assembly directive. Accepts either an
+/// integer literal, stored in `staticSize` as an index attribute, or an SSA
+/// operand, stored in `dynamicSize`. The operand slot is taken by reference so
+/// that the parsed operand is visible to the caller.
+static ParseResult
+parseDynamicIndex(OpAsmParser &parser,
+                  std::optional<OpAsmParser::UnresolvedOperand> &dynamicSize,
+                  IntegerAttr &staticSize) {
+  int64_t staticVal;
+  OptionalParseResult intResult = parser.parseOptionalInteger(staticVal);
+  // No integer token present: the index must be a dynamic SSA operand.
+  if (!intResult.has_value())
+    return parser.parseOperand(dynamicSize.emplace());
+  // An integer was seen but could not be parsed; propagate the failure
+  // instead of building an attribute from an unset value.
+  if (failed(*intResult))
+    return failure();
+  staticSize = parser.getBuilder().getIndexAttr(staticVal);
+  return success();
+}
+
+/// Printer for the `custom<DynamicIndex>` assembly directive. Prints the
+/// static value when `staticSize` is set, otherwise the dynamic operand.
+static void printDynamicIndex(OpAsmPrinter &printer, Operation *op,
+                              Value dynamicSize, IntegerAttr staticSize) {
+  if (staticSize)
+    printer << staticSize.getValue();
+  else
+    printer << dynamicSize;
+}
+
 void AMDGPUDialect::initialize() {
   addOperations<
 #define GET_OP_LIST
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 6df7c300e5bc7..36a4f1644c28a 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -700,6 +700,7 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32,
 // CHECK-LABEL: func @make_dma_descriptor
 // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[BARRIER:.+]]: memref<8xi32>, %[[IDX:.+]]: index)
 func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8xi32>, %idx: index) {
+
   // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
   amdgpu.make_dma_descriptor %base
         // CHECK-SAME: globalSize [0]
@@ -709,6 +710,30 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8x
         // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
 	sharedSize [0] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
 
+  // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
+  amdgpu.make_dma_descriptor %base
+        // CHECK-SAME: globalSize [0]
+	globalSize [0]
+        // CHECK-SAME: globalStride [1]
+	globalStride [1]
+        // CHECK-SAME: sharedSize [0]
+	sharedSize [0]
+        // CHECK-SAME: padShared(1 every 1)
+	padShared(1 every 1)
+	: !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+
+  // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
+  amdgpu.make_dma_descriptor %base
+        // CHECK-SAME: globalSize [0]
+	globalSize [0]
+        // CHECK-SAME: globalStride [1]
+	globalStride [1]
+        // CHECK-SAME: sharedSize [0]
+	sharedSize [0]
+        // CHECK-SAME: padShared(1 every 1)
+	padShared(%idx every %idx)
+	: !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+
   // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
   amdgpu.make_dma_descriptor %base
         // CHECK-SAME: globalSize [0]

>From 2be4ccccbcd1971da13173da3087ba8b8c56208e Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Mon, 24 Nov 2025 16:21:48 -0500
Subject: [PATCH 11/13] Fix indentation

---
 mlir/test/Dialect/AMDGPU/ops.mlir | 46 +++++++++++++++----------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 36a4f1644c28a..0bc13e4256244 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -704,59 +704,59 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8x
   // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
   amdgpu.make_dma_descriptor %base
         // CHECK-SAME: globalSize [0]
-	globalSize [0]
+        globalSize [0]
         // CHECK-SAME: globalStride [1]
-	globalStride [1]
+        globalStride [1]
         // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
-	sharedSize [0] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+        sharedSize [0] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
 
   // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
   amdgpu.make_dma_descriptor %base
         // CHECK-SAME: globalSize [0]
-	globalSize [0]
+        globalSize [0]
         // CHECK-SAME: globalStride [1]
-	globalStride [1]
+        globalStride [1]
         // CHECK-SAME: sharedSize [0]
-	sharedSize [0]
+        sharedSize [0]
         // CHECK-SAME: padShared(1 every 1)
-	padShared(1 every 1)
-	: !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+        padShared(1 every 1)
+        : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
 
   // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
   amdgpu.make_dma_descriptor %base
         // CHECK-SAME: globalSize [0]
-	globalSize [0]
+        globalSize [0]
         // CHECK-SAME: globalStride [1]
-	globalStride [1]
+        globalStride [1]
         // CHECK-SAME: sharedSize [0]
-	sharedSize [0]
+        sharedSize [0]
         // CHECK-SAME: padShared(1 every 1)
-	padShared(%idx every %idx)
-	: !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+        padShared(%idx every %idx)
+        : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
 
   // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
   amdgpu.make_dma_descriptor %base
         // CHECK-SAME: globalSize [0]
-	globalSize [0]
+        globalSize [0]
         // CHECK-SAME: globalStride [1]
-	globalStride [1]
+        globalStride [1]
         // CHECK-SAME: sharedSize [0]
-	sharedSize [0]
+        sharedSize [0]
         // CHECK-SAME: atomicBarrier(%[[BARRIER]] [0] : memref<8xi32>)
-	atomicBarrier(%barrier [0] : memref<8xi32>)
-	: !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+        atomicBarrier(%barrier [0] : memref<8xi32>)
+        : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
 
   // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
   amdgpu.make_dma_descriptor %base
         // CHECK-SAME: globalSize [0]
-	globalSize [0]
+        globalSize [0]
         // CHECK-SAME: globalStride [1]
-	globalStride [1]
+        globalStride [1]
         // CHECK-SAME: sharedSize [0]
-	sharedSize [0]
-        iterate %idx, %idx, %idx
+        sharedSize [0]
         // CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]]
-	: !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+        iterate %idx, %idx, %idx
+        : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
 
 
   func.return

>From b3ba450d336c451b33685f67ddcd42e4a500d80c Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Mon, 24 Nov 2025 17:10:42 -0500
Subject: [PATCH 12/13] Review

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 24 +++++++++++++++----
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  | 11 +++++----
 mlir/test/Dialect/AMDGPU/invalid.mlir         |  2 +-
 mlir/test/Dialect/AMDGPU/ops.mlir             | 20 ++++++++--------
 4 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index d33605220c442..981698a8d25e6 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1251,7 +1251,7 @@ def AMDGPU_MakeDmaBaseOp :
   }];
 
   let assemblyFormat = [{
-    $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` type($src) `,` type($dst) `to` type(results)
+    $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results)
   }];
 }
 
@@ -1277,9 +1277,25 @@ def AMDGPU_MakeDmaDescriptorOp :
     Optional<Index>: $iteration_count)>,
   Results<(outs AMDGPU_TDMDescriptorType: $desc)> {
 
-  let summary = "TODO";
+  let summary = "Make all descriptor groups needed by TensorLoadToLDS/TensorStoreFromLDS.";
   let description = [{
-    TODO
+     Make all descriptor groups needed by tensor memory operations.
+
+     The $base operand corresponds to the pair of base addresses: one must be an
+     address in LDS while the other must be a global memory location.
+
+     $global_{static/dynamic}_sizes determine the size of the tensor.
+     $global_{static/dynamic}_strides determine the strides of the tensor.
+     $shared_{static/dynamic}_sizes determine the size of the tile.
+
+     Padding can be applied to the LDS address when copying from memory to LDS,
+     but not when copying from LDS to memory.
+     The values in the padded target addresses remain the same as before the operation was applied.
+
+     2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count.
+     $global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type.
+     $lds_increment determines how much to increment the starting LDS address per iteration in units of the $base's element type.
+     $iteration_count determines how many times to iterate.
   }];
 
   let assemblyFormat = [{
@@ -1292,7 +1308,7 @@ def AMDGPU_MakeDmaDescriptorOp :
                           custom<DynamicIndexList>($atomic_barrier_dynamic_indices, $atomic_barrier_static_indices)
                       `:` type($atomic_barrier_address) `)`)?
     ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )?
-    attr-dict `:` qualified(type($base)) `to` type(results)
+    attr-dict `:` qualified(type($base)) `->` type(results)
   }];
 
   let hasVerifier = 1;
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index b382fec21f20a..6863dc4ad3e7f 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -54,12 +54,13 @@ static ParseResult
 parseDynamicIndex(OpAsmParser &parser,
                   std::optional<OpAsmParser::UnresolvedOperand> dynamicSize,
                   IntegerAttr &staticSize) {
-  int64_t staticVal;
+
+  int64_t staticVal = 0;
   if (parser.parseOptionalInteger(staticVal).has_value()) {
     staticSize = parser.getBuilder().getIndexAttr(staticVal);
     return success();
   }
-
+  
   return parser.parseOperand(dynamicSize.value());
 }
 
@@ -67,9 +68,9 @@ static void printDynamicIndex(OpAsmPrinter &printer, Operation *op,
                               Value dynamicSize, IntegerAttr staticSize) {
   if (staticSize) {
     printer << staticSize.getValue();
-  } else {
-    printer << dynamicSize;
-  }
+    return;
+  } 
+  printer << dynamicSize;
 }
 
 void AMDGPUDialect::initialize() {
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index e8a0bfe9476a7..a72193d532ab9 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -361,6 +361,6 @@ func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32x
 // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
 func.func @make_dma_descriptor_invalid_strides(%base: !amdgpu.tdm_base<i32>) {
   // expected-error at +1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}}
-  amdgpu.make_dma_descriptor %base globalSize [0] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+  amdgpu.make_dma_descriptor %base globalSize [0] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
   func.return
 }
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 0bc13e4256244..2984bedac7bf5 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -689,11 +689,11 @@ func.func @memory_counter_wait() {
 // CHECK-LABEL: func @make_dma_base
 // CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32>, %[[SMEM:.+]]: memref<8xi32, #gpu.address_space<workgroup>>)
 func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32, #gpu.address_space<workgroup>>) {
-  // CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> to !amdgpu.tdm_base<i32>
-  amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> to !amdgpu.tdm_base<i32>
+  // CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+  amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
 
-  // CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space<workgroup>>, memref<8xi32> to !amdgpu.tdm_base<i32>
-  amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space<workgroup>>, memref<8xi32> to !amdgpu.tdm_base<i32>
+  // CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space<workgroup>>, memref<8xi32> -> !amdgpu.tdm_base<i32>
+  amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space<workgroup>>, memref<8xi32> -> !amdgpu.tdm_base<i32>
   func.return
 }
 
@@ -707,8 +707,8 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8x
         globalSize [0]
         // CHECK-SAME: globalStride [1]
         globalStride [1]
-        // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
-        sharedSize [0] : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+        // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+        sharedSize [0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
 
   // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
   amdgpu.make_dma_descriptor %base
@@ -720,7 +720,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8x
         sharedSize [0]
         // CHECK-SAME: padShared(1 every 1)
         padShared(1 every 1)
-        : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+        : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
 
   // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
   amdgpu.make_dma_descriptor %base
@@ -732,7 +732,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8x
         sharedSize [0]
         // CHECK-SAME: padShared(1 every 1)
         padShared(%idx every %idx)
-        : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+        : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
 
   // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
   amdgpu.make_dma_descriptor %base
@@ -744,7 +744,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8x
         sharedSize [0]
         // CHECK-SAME: atomicBarrier(%[[BARRIER]] [0] : memref<8xi32>)
         atomicBarrier(%barrier [0] : memref<8xi32>)
-        : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+        : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
 
   // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
   amdgpu.make_dma_descriptor %base
@@ -756,7 +756,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8x
         sharedSize [0]
         // CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]]
         iterate %idx, %idx, %idx
-        : !amdgpu.tdm_base<i32> to !amdgpu.tdm_descriptor
+        : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
 
 
   func.return

>From d34c423efcaa0a655ed599e04d9ec56177270dcb Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Mon, 24 Nov 2025 17:53:03 -0500
Subject: [PATCH 13/13] Fix parser

---
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 16 +++++++++++-----
 mlir/test/Dialect/AMDGPU/ops.mlir            |  2 +-
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 6863dc4ad3e7f..f37ba43fcaa39 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -52,16 +52,22 @@ struct AMDGPUInlinerInterface final : DialectInlinerInterface {
 
 static ParseResult
 parseDynamicIndex(OpAsmParser &parser,
-                  std::optional<OpAsmParser::UnresolvedOperand> dynamicSize,
+                  std::optional<OpAsmParser::UnresolvedOperand> &dynamicSize,
                   IntegerAttr &staticSize) {
 
-  int64_t staticVal = 0;
-  if (parser.parseOptionalInteger(staticVal).has_value()) {
+  int64_t staticVal;
+  OptionalParseResult parseResult = parser.parseOptionalInteger(staticVal);
+  if (parseResult.has_value()) {
     staticSize = parser.getBuilder().getIndexAttr(staticVal);
     return success();
   }
-  
-  return parser.parseOperand(dynamicSize.value());
+
+  // ParseResult is true in a boolean context on FAILURE, so bail out first.
+  OpAsmParser::UnresolvedOperand operand = OpAsmParser::UnresolvedOperand{};
+  if (parser.parseOperand(operand))
+    return failure();
+  dynamicSize = operand;
+  return success();
 }
 
 static void printDynamicIndex(OpAsmPrinter &printer, Operation *op,
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 2984bedac7bf5..923b30ce95363 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -730,7 +730,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8x
         globalStride [1]
         // CHECK-SAME: sharedSize [0]
         sharedSize [0]
-        // CHECK-SAME: padShared(1 every 1)
+        // CHECK-SAME: padShared(%[[IDX]] every %[[IDX]])
         padShared(%idx every %idx)
         : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
 



More information about the Mlir-commits mailing list