[Mlir-commits] [mlir] [mlir][amdgpu] Add amdgpu.make_dma_descriptor (PR #169407)

Krzysztof Drewniak llvmlistbot at llvm.org
Wed Nov 26 11:45:08 PST 2025


================
@@ -1192,4 +1227,132 @@ def AMDGPU_ScaledMFMAOp :
   }];
   let hasCanonicalizer = 1;
 }
+
+def AMDGPU_MakeDmaBaseOp :
+    AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>,
+    Arguments<(ins
+                   Arg<AnyMemRef, "buffer to read from">:$src,
+                   Variadic<Index>:$src_indices,
+                   OptionalAttr<DenseI64ArrayAttr>: $src_indices_const,
+                   Arg<AnyMemRef, "buffer to write to">:$dst,
+                   Variadic<Index>:$dst_indices,
+                   OptionalAttr<DenseI64ArrayAttr>: $dst_indices_const)>,
+    Results<(outs AMDGPU_TDMBaseType: $base)> {
+
+  // TODO:
+  // * Add verifiers to check that one of the memrefs is in LDS and the other in global memory.
+  // * Add verifiers to ensure that the number of indices does not exceed the number of dimensions.
+
+  let summary = "Pair of based addresses used when moving tiles between LDS and global memory.";
+  let description = [{
+    This operation creates the pair of addresses used by tensor_load_to_lds
+    and tensor_store_from_lds.
+
+    The result corresponds roughly to descriptor group 0 of TensorLoadToLDSOp
+    and TensorStoreFromLDSOp in the rocdl dialect. For example, the following IR:
+
+    ```mlir
+      %base = amdgpu.make_dma_base %src[%idx0], %dst[%idx1] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+      %descriptor = amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [2, 1] sharedSize [2, 2] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+      amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
+    ```
+
+    lowers roughly to:
+
+    ```mlir
+       // pseudocode
+       %base_0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr)>
+       %base_1 = llvm.insertvalue %global_addr, %base_0[0] : !llvm.struct<(ptr, ptr)>
+       %base_2 = llvm.insertvalue %lds_addr, %base_1[1] : !llvm.struct<(ptr, ptr)>
+       // type(%base_2) = !llvm.struct<(ptr, ptr)>, which roughly corresponds to !amdgpu.tdm_base<i32>
+
+       // The base will be used when constructing dgroup0
+       // while lowering amdgpu.make_dma_descriptor.
+       %dgroup0_0 = llvm.mlir.undef : !llvm.struct<(....)>
+       %dgroup0_1 = llvm.insertvalue %base_2, %dgroup0_0 : ....
+
+       // When lowering amdgpu.tensor_load_to_lds
+       rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+    ```
+  }];
+
+  // TODO: Define a custom printer/parser to avoid the space between $src/$dst and their indices.
+  let assemblyFormat = [{
+    $src custom<DynamicIndexList>($src_indices, $src_indices_const) `,`
+    $dst custom<DynamicIndexList>($dst_indices, $dst_indices_const) attr-dict `:` type($src) `,` type($dst) `->` type(results)
+  }];
+}
+
+def AMDGPU_MakeDmaDescriptorOp :
+  AMDGPU_Op<"make_dma_descriptor", [Pure, AttrSizedOperandSegments]>,
+  Arguments<(ins
+    AMDGPU_TDMBaseType: $base,
+    Variadic<Index>: $global_dynamic_sizes,
+    OptionalAttr<DenseI64ArrayAttr>: $global_static_sizes,
+    Variadic<Index>: $global_dynamic_strides,
+    OptionalAttr<DenseI64ArrayAttr>: $global_static_strides,
+    Variadic<Index>: $shared_dynamic_sizes,
+    OptionalAttr<DenseI64ArrayAttr>: $shared_static_sizes,
+    Optional<Index>: $pad,
+    OptionalAttr<IndexAttr>: $pad_const,
+    Optional<Index>: $every,
+    OptionalAttr<IndexAttr>: $every_const,
+    Optional<AnyMemRef>: $atomic_barrier_address,
+    Variadic<Index>: $atomic_barrier_dynamic_indices,
+    OptionalAttr<DenseI64ArrayAttr>: $atomic_barrier_static_indices,
+    Optional<Index>: $global_increment,
+    Optional<Index>: $lds_increment,
+    Optional<Index>: $iteration_count)>,
+  Results<(outs AMDGPU_TDMDescriptorType: $desc)> {
+
+  let summary = "Make all descriptor groups needed by TensorLoadToLDS/TensorStoreFromLDS.";
+  let description = [{
+     Make all descriptor groups needed by tensor memory operations.
+
+     The $base operand holds the pair of base addresses; one must be an address in LDS
+     while the other must be an address in global memory.
+
+     $global_{static/dynamic}_sizes determine the size of the tensor.
+     $global_{static/dynamic}_strides determine the strides of the tensor.
+     $shared_{static/dynamic}_sizes determine the size of the tile.
+
+     Padding can be applied to the LDS address when copying from memory to LDS,
+     but not when copying from LDS to memory.
+     The values at the padded target addresses are left unchanged (see the second example below).
+
+     2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment,
+     and $iteration_count (see the sketch after the examples below).
+     $global_increment determines how much to advance the starting global memory address
+     per iteration, in units of $base's element type.
+     $lds_increment determines how much to advance the starting LDS address per iteration,
+     in units of $base's element type.
+     $iteration_count determines how many times to iterate.
+
+     ```mlir
+      // Example of moving a two-dimensional tensor to LDS.
+      %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<64x64xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+      %descriptor = amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+      amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
+
+      // Example of moving a two-dimensional tensor to LDS where padding is
+      // applied after every element.
+      %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+      %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padShared(1 every 1) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+      amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
+     ```
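+
+     A sketch of the `iterate` clause, following the assembly format below; the
+     shapes, increments, and iteration count are illustrative only:
+
+     ```mlir
+      // Sketch: copy a 128x64 tensor to LDS as two 64x64 tiles, advancing both
+      // the global and the LDS base address by 64 * 64 = 4096 elements per iteration.
+      %c4096 = arith.constant 4096 : index
+      %c2 = arith.constant 2 : index
+      %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<128x64xi32>, memref<128x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+      %descriptor = amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] iterate %c4096, %c4096, %c2 : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+      amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
+     ```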
+  }];
+
+  let assemblyFormat = [{
+    $base
+    `globalSize` custom<DynamicIndexList>($global_dynamic_sizes, $global_static_sizes)
+    `globalStride` custom<DynamicIndexList>($global_dynamic_strides, $global_static_strides)
+    `sharedSize` custom<DynamicIndexList>($shared_dynamic_sizes, $shared_static_sizes)
+    ( `padShared` `(` custom<DynamicIndex>($pad, $pad_const)^ `every` custom<DynamicIndex>($every, $every_const) `)` )?
+    ( `atomicBarrier` `(` $atomic_barrier_address^
+                          custom<DynamicIndexList>($atomic_barrier_dynamic_indices, $atomic_barrier_static_indices)
+                      `:` type($atomic_barrier_address) `)`)?
+    ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )?
+    attr-dict `:` qualified(type($base)) `->` type(results)
+  }];
+
+  let hasVerifier = 1;
----------------
krzysz00 wrote:

We'll want the `OpFoldResult` helpers for combining the static/dynamic parts, I claim.
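
For reference, a minimal sketch of what such a helper could look like (the name `getMixedGlobalSizes` and its placement are hypothetical; `getMixedValues` is the existing utility from `mlir/Dialect/Utils/StaticValueUtils.h`):

```cpp
// Needs "mlir/Dialect/Utils/StaticValueUtils.h" and "llvm/ADT/SmallVectorExtras.h".
// Hypothetical accessor on MakeDmaDescriptorOp that merges the optional static
// sizes attribute with the dynamic size operands into OpFoldResults.
SmallVector<OpFoldResult> MakeDmaDescriptorOp::getMixedGlobalSizes() {
  Builder b(getContext());
  // With a static attribute present, dynamic operands fill the dynamic slots;
  // getMixedValues does the interleaving.
  if (DenseI64ArrayAttr staticSizes = getGlobalStaticSizesAttr())
    return getMixedValues(staticSizes.asArrayRef(), getGlobalDynamicSizes(), b);
  // Otherwise every size is a dynamic SSA value.
  return llvm::map_to_vector(getGlobalDynamicSizes(),
                             [](Value v) -> OpFoldResult { return v; });
}
```

The same pattern would repeat for the strides, shared sizes, and barrier indices.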

https://github.com/llvm/llvm-project/pull/169407


More information about the Mlir-commits mailing list