[Mlir-commits] [mlir] [mlir][rocdl] Add GlobalLoadAsyncToLDS operation (PR #165374)

Pablo Antonio Martinez llvmlistbot at llvm.org
Fri Oct 31 02:13:03 PDT 2025


https://github.com/pabloantoniom updated https://github.com/llvm/llvm-project/pull/165374

>From fe4f87b8eec1bf236494ea8597319140aa38b484 Mon Sep 17 00:00:00 2001
From: Pablo Antonio Martinez <pamartin at amd.com>
Date: Tue, 28 Oct 2025 06:47:08 -0500
Subject: [PATCH 1/2] [mlir][rocdl] Add GlobalLoadAsyncToLDS operation

Adds `global.load.async.to.lds` op to rocdl, supporting `b8`, `b32`,
`b64` and `b128`. The op is lowered to the appropriate
`llvm.amdgcn.global.load.async.to.lds.bXX` intrinsic.

This is available on gfx1250+.
---
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 33 ++++++++++++++++++++
 mlir/test/Dialect/LLVMIR/rocdl.mlir          | 13 ++++++++
 mlir/test/Target/LLVMIR/rocdl.mlir           | 24 ++++++++++++++
 3 files changed, 70 insertions(+)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 5241f9a6f2b43..db88aad4871e8 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -692,6 +692,39 @@ def ROCDL_GlobalLoadLDSOp :
   }];
 }
 
+//===---------------------------------------------------------------------===//
+// Async load to LDS intrinsic (available in GFX1250)
+//===---------------------------------------------------------------------===//
+
+class ROCDL_GlobalLoadAsyncToLDSOp<string mnemonic> :
+  ROCDL_IntrOp<mnemonic, [], [], [], 0, 0, 1, 0, [2, 3], ["offset", "aux"]> {
+  dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr,
+                 Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
+                 I32Attr:$offset,
+                 I32Attr:$aux);
+  let arguments = !con(args, baseArgs);
+  let assemblyFormat = [{
+    $globalPtr `,`  $ldsPtr `,` $offset `,` $aux
+    attr-dict `:` type($globalPtr)
+  }];
+  let description = [{
+    Loads data asynchronously from a global memory pointer to a local data
+    store (LDS) pointer.
+
+    Available on gfx1250+.
+  }];
+  let extraClassDefinition = [{
+    ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
+      return {getGlobalPtr(), getLdsPtr()};
+    }
+  }];
+}
+
+def ROCDL_GlobalLoadAsyncToLDSB8Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b8">;
+def ROCDL_GlobalLoadAsyncToLDSB32Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b32">;
+def ROCDL_GlobalLoadAsyncToLDSB64Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b64">;
+def ROCDL_GlobalLoadAsyncToLDSB128Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b128">;
+
 //===---------------------------------------------------------------------===//
 // Tensor load/store intrinsics (available in GFX1250)
 //===---------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index e703600c71c8e..308f9036d3dcf 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -664,6 +664,19 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
   llvm.return
 }
 
+llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+  // CHECK-LABEL: @rocdl.global.load.async.to.lds
+  // CHECK: rocdl.global.load.async.to.lds.b8 %{{.*}}, %{{.*}}, 0, 0
+  // CHECK: rocdl.global.load.async.to.lds.b32 %{{.*}}, %{{.*}}, 0, 0
+  // CHECK: rocdl.global.load.async.to.lds.b64 %{{.*}}, %{{.*}}, 0, 0
+  // CHECK: rocdl.global.load.async.to.lds.b128 %{{.*}}, %{{.*}}, 0, 0
+  rocdl.global.load.async.to.lds.b8 %src, %dst, 0, 0 : <1>
+  rocdl.global.load.async.to.lds.b32 %src, %dst, 0, 0 : <1>
+  rocdl.global.load.async.to.lds.b64 %src, %dst, 0, 0 : <1>
+  rocdl.global.load.async.to.lds.b128 %src, %dst, 0, 0 : <1>
+  llvm.return
+}
+
 // CHECK-LABEL @rocdl.tensor.load.to.lds
 llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>,
                                     %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) {
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 8a848221a50dd..c58c1145e77d4 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -1040,6 +1040,30 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
   llvm.return
 }
 
+llvm.func @rocdl.global.load.async.lds.b8(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+  // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b8
+  rocdl.global.load.async.to.lds.b8 %src, %dst, 0, 0 : !llvm.ptr<1>
+  llvm.return
+}
+
+llvm.func @rocdl.global.load.async.lds.b32(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+  // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b32
+  rocdl.global.load.async.to.lds.b32 %src, %dst, 0, 0 : !llvm.ptr<1>
+  llvm.return
+}
+
+llvm.func @rocdl.global.load.async.lds.b64(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+  // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b64
+  rocdl.global.load.async.to.lds.b64 %src, %dst, 0, 0 : !llvm.ptr<1>
+  llvm.return
+}
+
+llvm.func @rocdl.global.load.async.lds.b128(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+  // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b128
+  rocdl.global.load.async.to.lds.b128 %src, %dst, 0, 0 : !llvm.ptr<1>
+  llvm.return
+}
+
 // CHECK-LABEL: rocdl.tensor.load.to.lds
 llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>,
                                     %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) {

>From 4a11e8038ca071f883b09b1932f979cb2b0d16a7 Mon Sep 17 00:00:00 2001
From: Pablo Antonio Martinez <pamartin at amd.com>
Date: Fri, 31 Oct 2025 03:26:48 -0500
Subject: [PATCH 2/2] Address ravil comments, note that I changed the foreach
 slightly

---
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 51 ++++++++++----------
 mlir/test/Target/LLVMIR/rocdl.mlir           | 15 +-----
 2 files changed, 27 insertions(+), 39 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index db88aad4871e8..4e8d9edf174da 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -696,34 +696,33 @@ def ROCDL_GlobalLoadLDSOp :
 // Async load to LDS intrinsic (available in GFX1250)
 //===---------------------------------------------------------------------===//
 
-class ROCDL_GlobalLoadAsyncToLDSOp<string mnemonic> :
-  ROCDL_IntrOp<mnemonic, [], [], [], 0, 0, 1, 0, [2, 3], ["offset", "aux"]> {
-  dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr,
-                 Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
-                 I32Attr:$offset,
-                 I32Attr:$aux);
-  let arguments = !con(args, baseArgs);
-  let assemblyFormat = [{
-    $globalPtr `,`  $ldsPtr `,` $offset `,` $aux
-    attr-dict `:` type($globalPtr)
-  }];
-  let description = [{
-    Loads data asynchronously from a global memory pointer to a local data
-    store (LDS) pointer.
+foreach bitsVal = [8, 32, 64, 128] in {
+  defvar bitsStr = "b" # !cast<string>(bitsVal);
+  def ROCDL_GlobalLoadAsyncToLDS # !toupper(bitsStr) # Op :
+    ROCDL_IntrOp<"global.load.async.to.lds." # bitsStr, [], [], [], 0, 0, 1, 0, [2, 3], ["offset", "aux"]> {
+    dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr,
+                   Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr,
+                   I32Attr:$offset,
+                   I32Attr:$aux);
+    let arguments = !con(args, baseArgs);
+    let assemblyFormat = [{
+      $globalPtr `,`  $ldsPtr `,` $offset `,` $aux
+      attr-dict `:` type($globalPtr)
+    }];
+    let description = [{
+      Asynchronously loads }] # !cast<string>(bitsVal) # [{ bits of data from a global memory pointer
+      to a Local Data Share (LDS) pointer.
 
-    Available on gfx1250+.
-  }];
-  let extraClassDefinition = [{
-    ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
-      return {getGlobalPtr(), getLdsPtr()};
-    }
-  }];
-}
+      Available on gfx1250+.
+    }];
 
-def ROCDL_GlobalLoadAsyncToLDSB8Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b8">;
-def ROCDL_GlobalLoadAsyncToLDSB32Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b32">;
-def ROCDL_GlobalLoadAsyncToLDSB64Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b64">;
-def ROCDL_GlobalLoadAsyncToLDSB128Op : ROCDL_GlobalLoadAsyncToLDSOp<"global.load.async.to.lds.b128">;
+    let extraClassDefinition = [{
+      ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
+        return {getGlobalPtr(), getLdsPtr()};
+      }
+    }];
+  }
+}
 
 //===---------------------------------------------------------------------===//
 // Tensor load/store intrinsics (available in GFX1250)
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index c58c1145e77d4..410c81f52d0a6 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -1040,25 +1040,14 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
   llvm.return
 }
 
-llvm.func @rocdl.global.load.async.lds.b8(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
+// CHECK-LABEL: rocdl.global.load.async.to.lds
+llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
   // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b8
   rocdl.global.load.async.to.lds.b8 %src, %dst, 0, 0 : !llvm.ptr<1>
-  llvm.return
-}
-
-llvm.func @rocdl.global.load.async.lds.b32(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
   // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b32
   rocdl.global.load.async.to.lds.b32 %src, %dst, 0, 0 : !llvm.ptr<1>
-  llvm.return
-}
-
-llvm.func @rocdl.global.load.async.lds.b64(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
   // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b64
   rocdl.global.load.async.to.lds.b64 %src, %dst, 0, 0 : !llvm.ptr<1>
-  llvm.return
-}
-
-llvm.func @rocdl.global.load.async.lds.b128(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) {
   // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b128
   rocdl.global.load.async.to.lds.b128 %src, %dst, 0, 0 : !llvm.ptr<1>
   llvm.return



More information about the Mlir-commits mailing list