[Mlir-commits] [mlir] [mlir][amdgpu] wrapper for gfx1250 async global load to lds intrinsics (PR #189279)
Eric Feng
llvmlistbot at llvm.org
Sun Mar 29 12:36:03 PDT 2026
https://github.com/efric updated https://github.com/llvm/llvm-project/pull/189279
>From 65be7dfa0620c70fea6b2596a40efe408ac4cbe6 Mon Sep 17 00:00:00 2001
From: Eric Feng <Eric.Feng at amd.com>
Date: Sun, 29 Mar 2026 12:29:28 -0700
Subject: [PATCH] amdgpu global load async to lds bN
Signed-off-by: Eric Feng <Eric.Feng at amd.com>
---
.../mlir/Dialect/AMDGPU/IR/AMDGPUOps.td | 46 ++++++++++++
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 67 +++++++++++++++++
mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp | 31 ++++++++
.../Conversion/AMDGPUToROCDL/gfx1250.mlir | 73 +++++++++++++++++++
mlir/test/Dialect/AMDGPU/invalid.mlir | 35 +++++++++
mlir/test/Dialect/AMDGPU/ops.mlir | 26 +++++++
6 files changed, 278 insertions(+)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index 3eb039305904f..7481a162072c8 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1295,6 +1295,52 @@ def AMDGPU_GatherToLDSOp :
let hasCanonicalizer = 1;
}
+def AMDGPU_GlobalLoadAsyncToLDSOp :
+ AMDGPU_Op<"global_load_async_to_lds", [AttrSizedOperandSegments]>,
+ Arguments<(ins
+ Arg<AnyMemRef, "global memory to load from", [MemRead]>:$src,
+ Variadic<Index>:$srcIndices,
+ Arg<AnyMemRef, "LDS memory to write to", [MemWrite]>:$dst,
+ Variadic<Index>:$dstIndices,
+ TypeAttr:$transferType
+ )>,
+ Results<(outs)> {
+ let summary = "MLIR wrapper for async global load to lds instructions";
+ let description = [{
+ AMDGPU wrapper for `global.load.async.to.lds` instructions, which perform an
+ asynchronous load of data from global memory into LDS while bypassing VGPRs.
+
+ * `$src`: global memory memref to read from (global addrspace only, no fat buffer).
+ * `$srcIndices`: indices into `$src` for this thread's global read location.
+ * `$dst`: LDS memref to write to (workgroup addrspace).
+ * `$dstIndices`: indices into `$dst` for this thread's LDS write location.
+ * `$transferType`: type of data to be transferred. Must be 8, 32, 64 or 128 bit scalar
+ or vector type.
+
+ Note: only supported on gfx1250 and later.
+
+ Examples:
+ ```mlir
+ amdgpu.global_load_async_to_lds %src[%i, %j], %dst[%k, %l]
+ : f32, memref<128x64xf32, #gpu.address_space<global>>,
+ memref<64x64xf32, #gpu.address_space<workgroup>>
+
+ amdgpu.global_load_async_to_lds %src[%i, %j], %dst[%k, %l]
+ : vector<4xf32>, memref<128x64xf32, #gpu.address_space<global>>,
+ memref<64x64xf32, #gpu.address_space<workgroup>>
+
+ amdgpu.global_load_async_to_lds %src[%i], %dst[%j]
+ : i8, memref<512xi8, #gpu.address_space<global>>,
+ memref<256xi8, #gpu.address_space<workgroup>>
+ ```
+ }];
+ let assemblyFormat = [{
+ $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]`
+ attr-dict `:` $transferType `,` type($src) `,` type($dst)
+ }];
+ let hasVerifier = 1;
+}
+
def AMDGPU_TransposeLoadOp :
AMDGPU_Op<"transpose_load", [SameVariadicOperandSize]>,
Arguments<(ins Arg<AnyMemRef, "buffer to transpose load from", [MemRead]>:$src, Variadic<Index>:$srcIndices)>,
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 14c12f5a787a6..f12468a520953 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2074,6 +2074,72 @@ struct GatherToLDSOpLowering : public ConvertOpToLLVMPattern<GatherToLDSOp> {
}
};
+/// Lowers `amdgpu.global_load_async_to_lds` to one of the
+/// `ROCDL::GlobalLoadAsyncToLDSB{8,32,64,128}Op` ops. The ROCDL op is selected
+/// by the total bit width of the op's `transferType` attribute (element bit
+/// width times element count for vector transfer types).
+struct GlobalLoadAsyncToLDSOpLowering
+    : public ConvertOpToLLVMPattern<GlobalLoadAsyncToLDSOp> {
+  GlobalLoadAsyncToLDSOpLowering(const LLVMTypeConverter &converter,
+                                 Chipset chipset)
+      : ConvertOpToLLVMPattern<GlobalLoadAsyncToLDSOp>(converter),
+        chipset(chipset) {}
+
+  /// Target chipset; the pattern rejects anything older than gfx1250.
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(GlobalLoadAsyncToLDSOp op,
+                  GlobalLoadAsyncToLDSOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (chipset < kGfx1250)
+      return op.emitOpError(
+          "global_load_async_to_lds is only supported on gfx1250+");
+
+    Location loc = op.getLoc();
+    auto srcMemRefType = cast<MemRefType>(op.getSrc().getType());
+    auto dstMemRefType = cast<MemRefType>(op.getDst().getType());
+
+    // Split the transfer type into (element type, element count) with a single
+    // dyn_cast rather than isa<> followed by repeated cast<>.
+    Type transferType = op.getTransferType();
+    Type transferElemType = transferType;
+    int64_t numElements = 1;
+    if (auto vectorTransfer = dyn_cast<VectorType>(transferType)) {
+      transferElemType = vectorTransfer.getElementType();
+      numElements = vectorTransfer.getNumElements();
+    }
+
+    // getIntOrFloatBitWidth() asserts on anything that is not an integer or
+    // float (e.g. `index`), so reject such types with a diagnostic before
+    // creating any IR.
+    if (!transferElemType.isIntOrFloat())
+      return op.emitOpError("unsupported transfer type");
+    int64_t transferBits =
+        numElements * transferElemType.getIntOrFloatBitWidth();
+
+    // Element addresses within the source and destination memrefs.
+    Value srcPtr = getStridedElementPtr(rewriter, loc, srcMemRefType,
+                                        adaptor.getSrc(),
+                                        adaptor.getSrcIndices());
+    Value dstPtr = getStridedElementPtr(rewriter, loc, dstMemRefType,
+                                        adaptor.getDst(),
+                                        adaptor.getDstIndices());
+
+    // The wrapper op has no way to specify these, so both are always zero.
+    auto offset = rewriter.getI32IntegerAttr(0);
+    auto aux = rewriter.getI32IntegerAttr(0);
+
+    switch (transferBits) {
+    case 8:
+      rewriter.replaceOpWithNewOp<ROCDL::GlobalLoadAsyncToLDSB8Op>(
+          op, srcPtr, dstPtr, offset, aux, ArrayAttr{}, ArrayAttr{},
+          ArrayAttr{});
+      break;
+    case 32:
+      rewriter.replaceOpWithNewOp<ROCDL::GlobalLoadAsyncToLDSB32Op>(
+          op, srcPtr, dstPtr, offset, aux, ArrayAttr{}, ArrayAttr{},
+          ArrayAttr{});
+      break;
+    case 64:
+      rewriter.replaceOpWithNewOp<ROCDL::GlobalLoadAsyncToLDSB64Op>(
+          op, srcPtr, dstPtr, offset, aux, ArrayAttr{}, ArrayAttr{},
+          ArrayAttr{});
+      break;
+    case 128:
+      rewriter.replaceOpWithNewOp<ROCDL::GlobalLoadAsyncToLDSB128Op>(
+          op, srcPtr, dstPtr, offset, aux, ArrayAttr{}, ArrayAttr{},
+          ArrayAttr{});
+      break;
+    default:
+      return op.emitOpError("unsupported transfer width");
+    }
+    return success();
+  }
+};
+
namespace {
struct ExtPackedFp8OpLowering final
: public ConvertOpToLLVMPattern<ExtPackedFp8Op> {
@@ -4076,6 +4142,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
ScaledExtPackedMatrixOpLowering, ScaledExtPackedOpLowering,
PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
+ GlobalLoadAsyncToLDSOpLowering,
TransposeLoadOpLowering, AMDGPUPermlaneLowering,
AMDGPUMakeDmaBaseLowering<MakeDmaBaseOp>,
AMDGPUMakeDmaBaseLowering<MakeGatherDmaBaseOp>,
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index b715f4ab93231..70a9f20d659a0 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -943,6 +943,37 @@ void GatherToLDSOp::getCanonicalizationPatterns(RewritePatternSet &results,
results.add<FoldGatherToLDSOfCast>(context);
}
+//===----------------------------------------------------------------------===//
+// GlobalLoadAsyncToLDSOp
+//===----------------------------------------------------------------------===//
+
+/// Verifies `amdgpu.global_load_async_to_lds`:
+///  * source and destination memrefs have the same element type;
+///  * the transfer type is an integer/float scalar or a vector of those, with
+///    a total size of 8, 32, 64, or 128 bits;
+///  * the source memref is in the global address space;
+///  * the destination memref is in the workgroup address space.
+LogicalResult GlobalLoadAsyncToLDSOp::verify() {
+  auto srcType = cast<MemRefType>(getSrc().getType());
+  auto dstType = cast<MemRefType>(getDst().getType());
+
+  if (srcType.getElementType() != dstType.getElementType())
+    return emitOpError("source and destination element types must match");
+
+  // Split the transfer type into (element type, element count).
+  Type transferType = getTransferType();
+  Type transferElemType = transferType;
+  int64_t numElements = 1;
+  if (auto vectorTransfer = dyn_cast<VectorType>(transferType)) {
+    transferElemType = vectorTransfer.getElementType();
+    numElements = vectorTransfer.getNumElements();
+  }
+
+  // `transferType` is an unconstrained TypeAttr, so it may hold types without
+  // a bit width (e.g. `index`). getIntOrFloatBitWidth() asserts on those, so
+  // diagnose them here instead of crashing the verifier.
+  if (!transferElemType.isIntOrFloat())
+    return emitOpError(
+        "transfer type must be an integer or float scalar or vector");
+
+  int64_t transferSize = numElements * transferElemType.getIntOrFloatBitWidth();
+  if (!llvm::is_contained({8, 32, 64, 128}, transferSize))
+    return emitOpError("transfer type size must be 8, 32, 64, or 128 bits");
+
+  if (!hasGlobalMemorySpace(srcType.getMemorySpace()))
+    return emitOpError("source memory address space must be global");
+
+  if (!hasWorkgroupMemorySpace(dstType.getMemorySpace()))
+    return emitOpError("destination memory address space must be Workgroup");
+
+  return success();
+}
+
//===----------------------------------------------------------------------===//
// TransposeLoadOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index f7c8145e43152..aa306d82399e8 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -941,3 +941,76 @@ func.func @ds_barrier_state_phase_parity(%state: !amdgpu.ds_barrier_state) -> i1
%parity = amdgpu.ds_barrier_state_phase_parity %state : !amdgpu.ds_barrier_state -> i1
func.return %parity : i1
}
+
+// -----
+// global_load_async_to_lds_bN
+
+// CHECK-LABEL: func @global_load_async_to_lds_b32
+func.func @global_load_async_to_lds_b32(
+ %global : memref<128x72xf32, #gpu.address_space<global>>) {
+ %c0 = arith.constant 0 : index
+ %c12 = arith.constant 12 : index
+ %c32 = arith.constant 32 : index
+ %alloc = memref.alloc() : memref<64x64xf32, #gpu.address_space<workgroup>>
+ // CHECK: rocdl.global.load.async.to.lds.b32
+ amdgpu.global_load_async_to_lds %global[%c12, %c0], %alloc[%c32, %c0]
+ : f32, memref<128x72xf32, #gpu.address_space<global>>,
+ memref<64x64xf32, #gpu.address_space<workgroup>>
+ func.return
+}
+
+// -----
+
+// CHECK-LABEL: func @global_load_async_to_lds_b8
+func.func @global_load_async_to_lds_b8(
+ %global : memref<128x72xi8, #gpu.address_space<global>>) {
+ %c0 = arith.constant 0 : index
+ %alloc = memref.alloc() : memref<64x64xi8, #gpu.address_space<workgroup>>
+ // CHECK: rocdl.global.load.async.to.lds.b8
+ amdgpu.global_load_async_to_lds %global[%c0, %c0], %alloc[%c0, %c0]
+ : i8, memref<128x72xi8, #gpu.address_space<global>>,
+ memref<64x64xi8, #gpu.address_space<workgroup>>
+ func.return
+}
+
+// -----
+
+// CHECK-LABEL: func @global_load_async_to_lds_b64
+func.func @global_load_async_to_lds_b64(
+ %global : memref<128x72xf32, #gpu.address_space<global>>) {
+ %c0 = arith.constant 0 : index
+ %alloc = memref.alloc() : memref<64x64xf32, #gpu.address_space<workgroup>>
+ // CHECK: rocdl.global.load.async.to.lds.b64
+ amdgpu.global_load_async_to_lds %global[%c0, %c0], %alloc[%c0, %c0]
+ : vector<2xf32>, memref<128x72xf32, #gpu.address_space<global>>,
+ memref<64x64xf32, #gpu.address_space<workgroup>>
+ func.return
+}
+
+// -----
+
+// CHECK-LABEL: func @global_load_async_to_lds_b128
+func.func @global_load_async_to_lds_b128(
+ %global : memref<128x72xf32, #gpu.address_space<global>>) {
+ %c0 = arith.constant 0 : index
+ %alloc = memref.alloc() : memref<64x64xf32, #gpu.address_space<workgroup>>
+ // CHECK: rocdl.global.load.async.to.lds.b128
+ amdgpu.global_load_async_to_lds %global[%c0, %c0], %alloc[%c0, %c0]
+ : vector<4xf32>, memref<128x72xf32, #gpu.address_space<global>>,
+ memref<64x64xf32, #gpu.address_space<workgroup>>
+ func.return
+}
+
+// -----
+
+// CHECK-LABEL: func @global_load_async_to_lds_dynamic_indices
+func.func @global_load_async_to_lds_dynamic_indices(
+ %global : memref<512xi32, #gpu.address_space<global>>,
+ %src_idx : index, %dst_idx : index) {
+ %alloc = memref.alloc() : memref<256xi32, #gpu.address_space<workgroup>>
+ // CHECK: rocdl.global.load.async.to.lds.b32
+ amdgpu.global_load_async_to_lds %global[%src_idx], %alloc[%dst_idx]
+ : i32, memref<512xi32, #gpu.address_space<global>>,
+ memref<256xi32, #gpu.address_space<workgroup>>
+ func.return
+}
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index d1bb43e5587a6..28970b69b10aa 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -333,6 +333,41 @@ func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 :
// -----
+func.func @global_load_async_to_lds_non_lds(%idx1 : index,
+ %mem1 : memref<32xf32, #gpu.address_space<global>>,
+ %mem2 : memref<32xf32>) {
+ // expected-error at +1 {{'amdgpu.global_load_async_to_lds' op destination memory address space must be Workgroup}}
+ amdgpu.global_load_async_to_lds %mem1[%idx1], %mem2[%idx1]
+ : f32, memref<32xf32, #gpu.address_space<global>>, memref<32xf32>
+ func.return
+}
+
+// -----
+
+func.func @global_load_async_to_lds_bad_size_16bit(%idx1 : index,
+ %mem1 : memref<32xf16, #gpu.address_space<global>>,
+ %mem2 : memref<32xf16, #gpu.address_space<workgroup>>) {
+ // expected-error at +1 {{'amdgpu.global_load_async_to_lds' op transfer type size must be 8, 32, 64, or 128 bits}}
+ amdgpu.global_load_async_to_lds %mem1[%idx1], %mem2[%idx1]
+ : f16, memref<32xf16, #gpu.address_space<global>>,
+ memref<32xf16, #gpu.address_space<workgroup>>
+ func.return
+}
+
+// -----
+
+func.func @global_load_async_to_lds_src_not_global(%idx1 : index,
+ %mem1 : memref<32xf32, #gpu.address_space<workgroup>>,
+ %mem2 : memref<32xf32, #gpu.address_space<workgroup>>) {
+ // expected-error at +1 {{'amdgpu.global_load_async_to_lds' op source memory address space must be global}}
+ amdgpu.global_load_async_to_lds %mem1[%idx1], %mem2[%idx1]
+ : f32, memref<32xf32, #gpu.address_space<workgroup>>,
+ memref<32xf32, #gpu.address_space<workgroup>>
+ func.return
+}
+
+// -----
+
func.func @scaled_mfma_invalid_m(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32xf4E2M1FN>, %arg2 : vector<16xf32>) -> vector<16xf32> {
// expected-error at +1 {{'amdgpu.scaled_mfma' op attribute 'm' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16, 32}}}
%0 = amdgpu.scaled_mfma 8x32x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32>
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index a4d4c54bd041c..589e7dd0a652d 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -681,6 +681,32 @@ func.func @gather_to_lds_0d(%mem1 : memref<f16>, %smem1 : memref<f16, #gpu.addre
func.return
}
+// CHECK-LABEL: func @global_load_async_to_lds
+func.func @global_load_async_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf32, #gpu.address_space<global>>, %mem2 : memref<32x32xf32, #gpu.address_space<global>>, %smem1 : memref<32xf32, #gpu.address_space<workgroup>>, %smem2 : memref<32x32xf32, #gpu.address_space<workgroup>>) {
+ // CHECK: amdgpu.global_load_async_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
+ // CHECK: amdgpu.global_load_async_to_lds %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
+ // CHECK: amdgpu.global_load_async_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}]
+ amdgpu.global_load_async_to_lds %mem2[%idx1, %idx2], %smem2[%idx1, %idx2]
+ : f32, memref<32x32xf32, #gpu.address_space<global>>,
+ memref<32x32xf32, #gpu.address_space<workgroup>>
+ amdgpu.global_load_async_to_lds %mem1[%idx1], %smem2[%idx1, %idx2]
+ : f32, memref<32xf32, #gpu.address_space<global>>,
+ memref<32x32xf32, #gpu.address_space<workgroup>>
+ amdgpu.global_load_async_to_lds %mem2[%idx1, %idx2], %smem2[%idx1, %idx2]
+ : vector<2xf32>, memref<32x32xf32, #gpu.address_space<global>>,
+ memref<32x32xf32, #gpu.address_space<workgroup>>
+ func.return
+}
+
+// CHECK-LABEL: func @global_load_async_to_lds_0d
+func.func @global_load_async_to_lds_0d(%mem1 : memref<f32, #gpu.address_space<global>>, %smem1 : memref<f32, #gpu.address_space<workgroup>>) {
+ // CHECK: amdgpu.global_load_async_to_lds %{{.*}}[], %{{.*}}[]
+ amdgpu.global_load_async_to_lds %mem1[], %smem1[]
+ : f32, memref<f32, #gpu.address_space<global>>,
+ memref<f32, #gpu.address_space<workgroup>>
+ func.return
+}
+
// CHECK-LABEL: func @memory_counter_wait
func.func @memory_counter_wait() {
// CHECK: amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4) tensor(5)
More information about the Mlir-commits
mailing list