[Mlir-commits] [mlir] 930ef77 - [mlir][amdgpu] Add optional write mask to amdgpu.global_load_async_to_lds (#190498)

Mon Apr 6 09:21:38 PDT 2026

Author: Eric Feng
Date: 2026-04-06T09:21:32-07:00
New Revision: 930ef7736e0bb4550821e4d66beb498f974bb837

URL: https://github.com/llvm/llvm-project/commit/930ef7736e0bb4550821e4d66beb498f974bb837
DIFF: https://github.com/llvm/llvm-project/commit/930ef7736e0bb4550821e4d66beb498f974bb837.diff

LOG: [mlir][amdgpu] Add optional write mask to amdgpu.global_load_async_to_lds (#190498)

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
    mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
    mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
    mlir/test/Dialect/AMDGPU/ops.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index e2a089f1fe628..8dafc757c2e85 100644

--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1302,7 +1302,8 @@ def AMDGPU_GlobalLoadAsyncToLDSOp :
                    Variadic<Index>:$srcIndices,
                    Arg<AnyMemRef, "LDS memory to write to", [MemWrite]>:$dst,
                    Variadic<Index>:$dstIndices,
-                   TypeAttr:$transferType
+                   TypeAttr:$transferType,
+                   Optional<I1>: $mask
                    )>,
     Results<(outs)> {
   let summary = "MLIR wrapper for async global load to lds instructions";
@@ -1316,6 +1317,8 @@ def AMDGPU_GlobalLoadAsyncToLDSOp :
     * `$dstIndices`: indices into `$dst` for this thread's LDS write location.
     * `$transferType`: type of data to be transferred. Must be 8, 32, 64 or 128 bit scalar
      or vector type.
+    * `$mask`: optional per-thread mask. When false, the thread's LDS write
+      is masked off. The global read still occurs for all threads regardless of mask.
 
     Note: only supported on gfx1250 and later.
 
@@ -1335,7 +1338,7 @@ def AMDGPU_GlobalLoadAsyncToLDSOp :
     ```
   }];
   let assemblyFormat = [{
-    $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]`
+    $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]`  (`,` $mask^)?
     attr-dict `:` $transferType `,` type($src) `,` type($dst)
   }];
   let hasVerifier = 1;

diff  --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 423d261bc188a..a2ec8a6ea25b1 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -28,8 +28,10 @@
 
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
+#include <cstdint>
 #include <optional>
 
 namespace mlir {
@@ -2109,6 +2111,17 @@ struct GlobalLoadAsyncToLDSOpLowering
         getStridedElementPtr(rewriter, loc, dstMemRefType, adaptor.getDst(),
                              adaptor.getDstIndices());
 
+    if (op.getMask()) {
+      Value mask = adaptor.getMask();
+      int64_t nullptrVal =
+          llvm::AMDGPU::getNullPointerValue(llvm::AMDGPUAS::LOCAL_ADDRESS);
+      Value nullInt =
+          createI32Constant(rewriter, loc, static_cast<int32_t>(nullptrVal));
+      Value nullPtr =
+          LLVM::IntToPtrOp::create(rewriter, loc, dstPtr.getType(), nullInt);
+      dstPtr = LLVM::SelectOp::create(rewriter, loc, mask, dstPtr, nullPtr);
+    }
+
     auto offset = rewriter.getI32IntegerAttr(0);
     auto aux = rewriter.getI32IntegerAttr(0);
 

diff  --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index aa306d82399e8..e43ece8c74fdf 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -1014,3 +1014,20 @@ func.func @global_load_async_to_lds_dynamic_indices(
       memref<256xi32, #gpu.address_space<workgroup>>
   func.return
 }
+
+// -----
+
+// CHECK-LABEL: func @global_load_async_to_lds_b128_masked
+func.func @global_load_async_to_lds_b128_masked(
+    %global : memref<128x72xf32, #gpu.address_space<global>>, %mask : i1) {
+  %c0 = arith.constant 0 : index
+  %alloc = memref.alloc() : memref<64x64xf32, #gpu.address_space<workgroup>>
+  // CHECK: [[NULLPTR_INT:%.*]] = llvm.mlir.constant(-1 : i32) : i32
+  // CHECK: [[NULLPTR:%.*]] = llvm.inttoptr [[NULLPTR_INT]] : i32 to !llvm.ptr<3>
+  // CHECK: [[DST:%.*]] = llvm.select {{.*}}, {{.*}}, [[NULLPTR]] : i1, !llvm.ptr<3>
+  // CHECK: rocdl.global.load.async.to.lds.b128 {{.*}}, [[DST]]
+  amdgpu.global_load_async_to_lds %global[%c0, %c0], %alloc[%c0, %c0], %mask
+    : vector<4xf32>, memref<128x72xf32, #gpu.address_space<global>>,
+      memref<64x64xf32, #gpu.address_space<workgroup>>
+  func.return
+}

diff  --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 589e7dd0a652d..6f4dd486610cc 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -682,10 +682,11 @@ func.func @gather_to_lds_0d(%mem1 : memref<f16>, %smem1 : memref<f16, #gpu.addre
 }
 
 // CHECK-LABEL: func @global_load_async_to_lds
-func.func @global_load_async_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf32, #gpu.address_space<global>>, %mem2 : memref<32x32xf32, #gpu.address_space<global>>, %smem1 : memref<32xf32, #gpu.address_space<workgroup>>, %smem2 : memref<32x32xf32, #gpu.address_space<workgroup>>) {
+func.func @global_load_async_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf32, #gpu.address_space<global>>, %mem2 : memref<32x32xf32, #gpu.address_space<global>>, %smem1 : memref<32xf32, #gpu.address_space<workgroup>>, %smem2 : memref<32x32xf32, #gpu.address_space<workgroup>>, %mask : i1) {
   // CHECK: amdgpu.global_load_async_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
   // CHECK: amdgpu.global_load_async_to_lds %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
   // CHECK: amdgpu.global_load_async_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
+  // CHECK: amdgpu.global_load_async_to_lds %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}
   amdgpu.global_load_async_to_lds %mem2[%idx1, %idx2], %smem2[%idx1, %idx2]
     : f32, memref<32x32xf32, #gpu.address_space<global>>,
       memref<32x32xf32, #gpu.address_space<workgroup>>
@@ -695,6 +696,9 @@ func.func @global_load_async_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref
   amdgpu.global_load_async_to_lds %mem2[%idx1, %idx2], %smem2[%idx1, %idx2]
     : vector<2xf32>, memref<32x32xf32, #gpu.address_space<global>>,
       memref<32x32xf32, #gpu.address_space<workgroup>>
+  amdgpu.global_load_async_to_lds %mem2[%idx1, %idx2], %smem2[%idx1, %idx2], %mask
+    : f32, memref<32x32xf32, #gpu.address_space<global>>,
+      memref<32x32xf32, #gpu.address_space<workgroup>>
   func.return
 }