[Mlir-commits] [mlir] 650a62f - [mlir][ROCDL] Add async variants of pre-gfx12 LDS load intrinsics (#181072)

llvmlistbot at llvm.org llvmlistbot at llvm.org
Mon Feb 16 12:40:59 PST 2026


Author: Krzysztof Drewniak
Date: 2026-02-16T12:40:54-08:00
New Revision: 650a62f970aee31de29b3e88a4c8036001659197

URL: https://github.com/llvm/llvm-project/commit/650a62f970aee31de29b3e88a4c8036001659197
DIFF: https://github.com/llvm/llvm-project/commit/650a62f970aee31de29b3e88a4c8036001659197.diff

LOG: [mlir][ROCDL] Add async variants of pre-gfx12 LDS load intrinsics (#181072)

These are MLIR wrappers around #180466.

Co-authored-by: Claude Opus 4.5 <noreply at anthropic.com>

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
    mlir/test/Dialect/LLVMIR/rocdl.mlir
    mlir/test/Target/LLVMIR/rocdl.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index c3af1bd32ebda..58ef3db3c4a7a 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -1102,6 +1102,49 @@ def ROCDL_LoadToLDSOp :
   }];
 }
 
+def ROCDL_LoadAsyncToLDSOp :
+  ROCDL_IntrOp<"load.async.to.lds", [], [0], [], 0, 0, 1, 0, [2, 3, 4], ["size", "offset", "aux"]> {
+  dag args = (ins Arg<LLVM_AnyPointer, "", [MemRead]>:$globalPtr,
+                 Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
+                 I32Attr:$size,
+                 I32Attr:$offset,
+                 I32Attr:$aux);
+  let arguments = !con(args, baseArgs);
+  let assemblyFormat = [{
+    $globalPtr `,`  $ldsPtr `,` $size `,` $offset `,` $aux
+    attr-dict `:` qualified(type($globalPtr)) `,` qualified(type($ldsPtr))
+  }];
+  let extraClassDefinition = [{
+    ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
+      return {getGlobalPtr(), getLdsPtr()};
+    }
+  }];
+
+  let summary = "Gathering load to LDS that requires explicit async memory tracking";
+  let description = [{
+    Load `size` bytes (the valid sizes vary by architecture) from the global memory
+    pointed to by `globalPtr` and put them at `ldsPtr`, concatenating (and applying
+    padding for sizes less than 4 bytes, along with padding out 12-byte reads
+    to 16-byte writes). The value of `globalPtr` can vary between lanes, while
+    `ldsPtr` must be subgroup-uniform (the values from each lane are concatenated
+    before being written to LDS with appropriate padding applied).
+
+    `offset` is a constant offset applied to **both** pointers, and `aux` sets the cache
+    policy. Unlike `rocdl.load.to.lds`, the compiler will not automatically insert waits
+    for this load to complete at the point it thinks you're using a region of LDS you've
+    stored values to - you need to use the `rocdl.asyncmark` and `rocdl.wait.asyncmark`
+    operations to explicitly group these operations and wait for their completion.
+
+    Available on gfx10 and earlier with varying supported values of `size`.
+
+    Example:
+    ```mlir
+    rocdl.load.async.to.lds %global, %shared, 4, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3>
+    rocdl.load.async.to.lds %fatBuffer, %shared, 4, 0, 0 : !llvm.ptr<7>, !llvm.ptr<3>
+    ```
+  }];
+}
+
 def ROCDL_GlobalLoadLDSOp :
   ROCDL_IntrOp<"global.load.lds", [], [], [], 0, 0, 1, 0, [2, 3, 4], ["size", "offset", "aux"]> {
   dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr,
@@ -1121,6 +1164,40 @@ def ROCDL_GlobalLoadLDSOp :
   }];
 }
 
+def ROCDL_GlobalLoadAsyncLDSOp :
+  ROCDL_IntrOp<"global.load.async.lds", [], [], [], 0, 0, 1, 0, [2, 3, 4], ["size", "offset", "aux"]> {
+  dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr,
+                 Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
+                 I32Attr:$size,
+                 I32Attr:$offset,
+                 I32Attr:$aux);
+  let arguments = !con(args, baseArgs);
+  let assemblyFormat = [{
+    $globalPtr `,`  $ldsPtr `,` $size `,` $offset `,` $aux
+    attr-dict `:` qualified(type($globalPtr)) `,` qualified(type($ldsPtr))
+  }];
+  let extraClassDefinition = [{
+    ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
+      return {getGlobalPtr(), getLdsPtr()};
+    }
+  }];
+
+  let summary = "Version of rocdl.load.async.to.lds specialized to global pointers";
+  let description = [{
+    This operation works identically to `rocdl.load.async.to.lds` except that the
+    global pointer argument is limited to pointers in address space 1 (pure global
+    pointers) instead of also allowing fat buffer pointers.
+
+    Available on gfx9 and gfx10.
+
+    For the operation introduced in gfx1250, see `rocdl.global.load.async.to.lds.bN`.
+    Example:
+    ```mlir
+    rocdl.global.load.async.lds %global, %shared, 4, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3>
+    ```
+  }];
+}
+
 //===---------------------------------------------------------------------===//
 // Async load to LDS intrinsic (available in GFX1250)
 //===---------------------------------------------------------------------===//
@@ -1293,6 +1370,34 @@ def ROCDL_RawPtrBufferLoadLdsOp :
   }];
 }
 
+def ROCDL_RawPtrBufferLoadAsyncLdsOp :
+  ROCDL_IntrOp<"raw.ptr.buffer.load.async.lds", [], [], [], 0, 0, 1> {
+  dag args = (ins Arg<ROCDLBufferRsrc, "", [MemRead]>:$rsrc,
+                  Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
+                  I32:$size,
+                  I32:$voffset,
+                  I32:$soffset,
+                  I32:$offset,
+                  I32:$aux);
+  let arguments = !con(args, baseArgs);
+  let assemblyFormat = "operands attr-dict";
+  let extraClassDefinition = [{
+    ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
+      return {getRsrc(), getLdsPtr()};
+    }
+  }];
+  let summary = "Async variant of raw.ptr.buffer.load.lds";
+  let description = [{
+    Load from a buffer resource `rsrc` to `ldsPtr`, which must be uniform.
+
+    See `rocdl.load.async.to.lds` for overall semantics of such loads, noting that
+    here `voffset` can be lane-varying and that `rsrc` (which holds the base address)
+    must, as always, be uniform.
+
+    Available on gfx9 and gfx10.
+  }];
+}
+
 def ROCDL_RawPtrBufferStoreOp :
   ROCDL_IntrOp<"raw.ptr.buffer.store", [], [0], [], 0, 0, 1> {
   dag args = (ins LLVM_Type:$vdata,

diff  --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 2adb5bc90915a..d01763ed87502 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -758,18 +758,32 @@ llvm.func @rocdl.load.tr.ops(%gl_ptr : !llvm.ptr<1>, %ds_ptr : !llvm.ptr<3>) {
 
 llvm.func @rocdl.load.to.lds(%src : !llvm.ptr<7>, %dst: !llvm.ptr<3>) {
   // CHECK-LABEL @rocdl.load.to.lds
-  //CHECK: rocdl.load.to.lds %{{.*}}, %{{.*}}, 4, 0, 0 : <7>
+  // CHECK: rocdl.load.to.lds %{{.*}}, %{{.*}}, 4, 0, 0 : <7>
   rocdl.load.to.lds %src, %dst, 4, 0, 0 : <7>
   llvm.return
 }
 
+llvm.func @rocdl.load.async.to.lds(%src : !llvm.ptr<7>, %dst: !llvm.ptr<3>) {
+  // CHECK-LABEL @rocdl.load.async.to.lds
+  // CHECK: rocdl.load.async.to.lds %{{.*}}, %{{.*}}, 4, 0, 0 : !llvm.ptr<7>, !llvm.ptr<3>
+  rocdl.load.async.to.lds %src, %dst, 4, 0, 0 : !llvm.ptr<7>, !llvm.ptr<3>
+  llvm.return
+}
+
 llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
   // CHECK-LABEL @rocdl.global.load.lds
-  //CHECK: rocdl.global.load.lds %{{.*}}, %{{.*}}, 4, 0, 0
+  // CHECK: rocdl.global.load.lds %{{.*}}, %{{.*}}, 4, 0, 0
   rocdl.global.load.lds %src, %dst, 4, 0, 0
   llvm.return
 }
 
+llvm.func @rocdl.global.load.async.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+  // CHECK-LABEL @rocdl.global.load.async.lds
+  // CHECK: rocdl.global.load.async.lds %{{.*}}, %{{.*}}, 4, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3>
+  rocdl.global.load.async.lds %src, %dst, 4, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3>
+  llvm.return
+}
+
 llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
   // CHECK-LABEL @rocdl.global.load.async.to.lds
   // CHECK: rocdl.global.load.async.to.lds.b8 %{{.*}}, %{{.*}}, 0, 0
@@ -877,6 +891,16 @@ llvm.func @rocdl.raw.ptr.buffer.load.lds(%rsrc : !llvm.ptr<8>, %dstLds : !llvm.p
   llvm.return
 }
 
+llvm.func @rocdl.raw.ptr.buffer.load.async.lds(%rsrc : !llvm.ptr<8>, %dstLds : !llvm.ptr<3>,
+                       %size: i32, %voffset : i32, %soffset : i32, %offset : i32,
+                       %aux : i32) {
+  // CHECK-LABEL: rocdl.raw.ptr.buffer.load.async.lds
+  // CHECK: rocdl.raw.ptr.buffer.load.async.lds %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}
+  rocdl.raw.ptr.buffer.load.async.lds %rsrc, %dstLds, %size, %voffset, %soffset, %offset, %aux
+
+  llvm.return
+}
+
 llvm.func @rocdl.raw.ptr.buffer.i32(%rsrc : !llvm.ptr<8>,
                        %offset : i32, %soffset : i32,
                        %aux : i32, %vdata1 : i32,

diff  --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 7a7e76410e4d2..0d6e0c8aea500 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -1224,12 +1224,24 @@ llvm.func @rocdl.load.to.lds(%src : !llvm.ptr<7>, %dst: !llvm.ptr<3>) {
   llvm.return
 }
 
+llvm.func @rocdl.load.async.to.lds(%src : !llvm.ptr<7>, %dst: !llvm.ptr<3>) {
+  //CHECK: call void @llvm.amdgcn.load.async.to.lds.p7
+  rocdl.load.async.to.lds %src, %dst, 4, 0, 0 : !llvm.ptr<7>, !llvm.ptr<3>
+  llvm.return
+}
+
 llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
   //CHECK: call void @llvm.amdgcn.global.load.lds
   rocdl.global.load.lds %src, %dst, 4, 0, 0
   llvm.return
 }
 
+llvm.func @rocdl.global.load.async.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+  //CHECK: call void @llvm.amdgcn.global.load.async.lds
+  rocdl.global.load.async.lds %src, %dst, 4, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3>
+  llvm.return
+}
+
 // CHECK-LABEL: rocdl.global.load.async.to.lds
 llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
   // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b8
@@ -1390,6 +1402,18 @@ llvm.func @rocdl.raw.ptr.buffer.load.lds(%rsrc : !llvm.ptr<8>, %dstLds : !llvm.p
   llvm.return
 }
 
+llvm.func @rocdl.raw.ptr.buffer.load.async.lds(%rsrc : !llvm.ptr<8>, %dstLds : !llvm.ptr<3>,
+                        %voffset : i32, %soffset : i32) {
+  %size = llvm.mlir.constant(4 : i32) : i32
+  %offset = llvm.mlir.constant(128 : i32) : i32
+  %aux = llvm.mlir.constant(1 : i32) : i32
+  // CHECK-LABEL: rocdl.raw.ptr.buffer.load.async.lds
+  // CHECK: call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %{{.*}}, ptr addrspace(3) %{{.*}}, i32 4, i32 %{{.*}}, i32 %{{.*}}, i32 128, i32 1
+  rocdl.raw.ptr.buffer.load.async.lds %rsrc, %dstLds, %size, %voffset, %soffset, %offset, %aux
+
+  llvm.return
+}
+
 llvm.func @rocdl.global.prefetch(%ptr : !llvm.ptr<1>) {
   // CHECK-LABEL: rocdl.global.prefetch
   // CHECK: call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %{{.*}}, i32 0)


        


More information about the Mlir-commits mailing list