[Mlir-commits] [mlir] 930ef77 - [mlir][amdgpu] Add optional write mask to amdgpu.global_load_async_to_lds (#190498)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Mon Apr 6 09:21:38 PDT 2026
Author: Eric Feng
Date: 2026-04-06T09:21:32-07:00
New Revision: 930ef7736e0bb4550821e4d66beb498f974bb837
URL: https://github.com/llvm/llvm-project/commit/930ef7736e0bb4550821e4d66beb498f974bb837
DIFF: https://github.com/llvm/llvm-project/commit/930ef7736e0bb4550821e4d66beb498f974bb837.diff
LOG: [mlir][amdgpu] Add optional write mask to amdgpu.global_load_async_to_lds (#190498)
Added:
Modified:
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
mlir/test/Dialect/AMDGPU/ops.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index e2a089f1fe628..8dafc757c2e85 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1302,7 +1302,8 @@ def AMDGPU_GlobalLoadAsyncToLDSOp :
Variadic<Index>:$srcIndices,
Arg<AnyMemRef, "LDS memory to write to", [MemWrite]>:$dst,
Variadic<Index>:$dstIndices,
- TypeAttr:$transferType
+ TypeAttr:$transferType,
+ Optional<I1>: $mask
)>,
Results<(outs)> {
let summary = "MLIR wrapper for async global load to lds instructions";
@@ -1316,6 +1317,8 @@ def AMDGPU_GlobalLoadAsyncToLDSOp :
* `$dstIndices`: indices into `$dst` for this thread's LDS write location.
* `$transferType`: type of data to be transferred. Must be 8, 32, 64 or 128 bit scalar
or vector type.
+ * `$mask`: optional per-thread mask. When false, the thread's LDS write
+ is masked off. The global read still occurs for all threads regardless of mask.
Note: only supported on gfx1250 and later.
@@ -1335,7 +1338,7 @@ def AMDGPU_GlobalLoadAsyncToLDSOp :
```
}];
let assemblyFormat = [{
- $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]`
+ $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` (`,` $mask^)?
attr-dict `:` $transferType `,` type($src) `,` type($dst)
}];
let hasVerifier = 1;
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 423d261bc188a..a2ec8a6ea25b1 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -28,8 +28,10 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
+#include <cstdint>
#include <optional>
namespace mlir {
@@ -2109,6 +2111,17 @@ struct GlobalLoadAsyncToLDSOpLowering
getStridedElementPtr(rewriter, loc, dstMemRefType, adaptor.getDst(),
adaptor.getDstIndices());
+ if (op.getMask()) {
+ Value mask = adaptor.getMask();
+ int64_t nullptrVal =
+ llvm::AMDGPU::getNullPointerValue(llvm::AMDGPUAS::LOCAL_ADDRESS);
+ Value nullInt =
+ createI32Constant(rewriter, loc, static_cast<int32_t>(nullptrVal));
+ Value nullPtr =
+ LLVM::IntToPtrOp::create(rewriter, loc, dstPtr.getType(), nullInt);
+ dstPtr = LLVM::SelectOp::create(rewriter, loc, mask, dstPtr, nullPtr);
+ }
+
auto offset = rewriter.getI32IntegerAttr(0);
auto aux = rewriter.getI32IntegerAttr(0);
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index aa306d82399e8..e43ece8c74fdf 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -1014,3 +1014,20 @@ func.func @global_load_async_to_lds_dynamic_indices(
memref<256xi32, #gpu.address_space<workgroup>>
func.return
}
+
+// -----
+
+// CHECK-LABEL: func @global_load_async_to_lds_b128_masked
+func.func @global_load_async_to_lds_b128_masked(
+ %global : memref<128x72xf32, #gpu.address_space<global>>, %mask : i1) {
+ %c0 = arith.constant 0 : index
+ %alloc = memref.alloc() : memref<64x64xf32, #gpu.address_space<workgroup>>
+ // CHECK: [[NULLPTR_INT:%.*]] = llvm.mlir.constant(-1 : i32) : i32
+ // CHECK: [[NULLPTR:%.*]] = llvm.inttoptr [[NULLPTR_INT]] : i32 to !llvm.ptr<3>
+ // CHECK: [[DST:%.*]] = llvm.select {{.*}}, {{.*}}, [[NULLPTR]] : i1, !llvm.ptr<3>
+ // CHECK: rocdl.global.load.async.to.lds.b128 {{.*}}, [[DST]]
+ amdgpu.global_load_async_to_lds %global[%c0, %c0], %alloc[%c0, %c0], %mask
+ : vector<4xf32>, memref<128x72xf32, #gpu.address_space<global>>,
+ memref<64x64xf32, #gpu.address_space<workgroup>>
+ func.return
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 589e7dd0a652d..6f4dd486610cc 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -682,10 +682,11 @@ func.func @gather_to_lds_0d(%mem1 : memref<f16>, %smem1 : memref<f16, #gpu.addre
}
// CHECK-LABEL: func @global_load_async_to_lds
-func.func @global_load_async_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf32, #gpu.address_space<global>>, %mem2 : memref<32x32xf32, #gpu.address_space<global>>, %smem1 : memref<32xf32, #gpu.address_space<workgroup>>, %smem2 : memref<32x32xf32, #gpu.address_space<workgroup>>) {
+func.func @global_load_async_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf32, #gpu.address_space<global>>, %mem2 : memref<32x32xf32, #gpu.address_space<global>>, %smem1 : memref<32xf32, #gpu.address_space<workgroup>>, %smem2 : memref<32x32xf32, #gpu.address_space<workgroup>>, %mask : i1) {
// CHECK: amdgpu.global_load_async_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
// CHECK: amdgpu.global_load_async_to_lds %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
// CHECK: amdgpu.global_load_async_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
+ // CHECK: amdgpu.global_load_async_to_lds %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}
amdgpu.global_load_async_to_lds %mem2[%idx1, %idx2], %smem2[%idx1, %idx2]
: f32, memref<32x32xf32, #gpu.address_space<global>>,
memref<32x32xf32, #gpu.address_space<workgroup>>
@@ -695,6 +696,9 @@ func.func @global_load_async_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref
amdgpu.global_load_async_to_lds %mem2[%idx1, %idx2], %smem2[%idx1, %idx2]
: vector<2xf32>, memref<32x32xf32, #gpu.address_space<global>>,
memref<32x32xf32, #gpu.address_space<workgroup>>
+ amdgpu.global_load_async_to_lds %mem2[%idx1, %idx2], %smem2[%idx1, %idx2], %mask
+ : f32, memref<32x32xf32, #gpu.address_space<global>>,
+ memref<32x32xf32, #gpu.address_space<workgroup>>
func.return
}
More information about the Mlir-commits
mailing list