[Mlir-commits] [mlir] [MLIR][AMDGPU] Added l2-prefetch op to AMDGPU (PR #188457)
Ravil Dorozhinskii
llvmlistbot at llvm.org
Thu Mar 26 05:55:29 PDT 2026
https://github.com/ravil-mobile updated https://github.com/llvm/llvm-project/pull/188457
>From e254bb6c801a8ce81ee9fd153e20ec126fcdfe68 Mon Sep 17 00:00:00 2001
From: ravil-mobile <ravil.aviva.com at gmail.com>
Date: Mon, 23 Mar 2026 17:34:46 +0000
Subject: [PATCH 1/6] [MLIR][AMDGPU] Added l2-prefetch op to AMDGPU
---
.../mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td | 3 ++
.../mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td | 21 ++++++++
.../mlir/Dialect/AMDGPU/IR/AMDGPUOps.td | 31 +++++++++++
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 54 ++++++++++++++++++-
mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp | 30 +++++++++++
.../AMDGPUToROCDL/global-prefetch.mlir | 15 ++++++
mlir/test/Dialect/AMDGPU/invalid.mlir | 54 +++++++++++++++++++
7 files changed, 206 insertions(+), 2 deletions(-)
create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td
index c862fb2fc5a3a..0e4ab8d5b6dc5 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td
@@ -47,4 +47,7 @@ def AMDGPU_SchedBarrierOpOptAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_SchedBarrierO
def AMDGPU_MFMAPermBAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_MFMAPermB,
"mfma_perm_b">;
+def AMDGPU_TemporalHintAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_TemporalHint,
+ "temporal_hint">;
+
#endif // MLIR_DIALECT_AMDGPU_IR_AMDGPUATTRS_TD
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td
index 4ec7cb3cd7307..68bae3d255447 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td
@@ -80,4 +80,25 @@ def AMDGPU_MFMAPermB : I32Enum<"MFMAPermB",
let cppNamespace = "::mlir::amdgpu";
}
+def AMDGPU_TemporalHint : I32Enum<"TemporalHint",
+ "AMDGPU-specific prefetch temporal hints. "
+ "RT - regular temporal for both near and far caches; "
+ "NT - non-temporal for both near and far caches; "
+ "HT - high-priority temporal for both near and far caches; "
+ "LU - last-use; "
+ "NT_RT - non-temporal for near cache(s) and regular for far caches; "
+ "RT_NT - regular for near cache(s) and non-temporal for far caches; "
+ "NT_HT - non-temporal for near cache(s) and high-priority temporal for far caches; ",
+ [
+ I32EnumAttrCase<"RT", 0>,
+ I32EnumAttrCase<"NT", 1>,
+ I32EnumAttrCase<"HT", 2>,
+ I32EnumAttrCase<"LU", 3>,
+ I32EnumAttrCase<"NT_RT", 4>,
+ I32EnumAttrCase<"RT_NT", 5>,
+ I32EnumAttrCase<"NT_HT", 6>
+ ]> {
+ let cppNamespace = "::mlir::amdgpu";
+}
+
#endif // MLIR_DIALECT_AMDGPU_IR_AMDGPUENUMS_TD
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index 3eb039305904f..308814f5a2ae8 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1952,4 +1952,35 @@ def AMDGPU_DsBarrierStatePhaseParity :
}];
}
+def AMDGPU_GlobalPrefetchOp :
+ AMDGPU_Op<"global_prefetch", [MemoryEffects<[MemWrite, MemRead]>]>,
+ Arguments<(ins AnyMemRef:$src,
+ Variadic<I64>:$indices,
+ AMDGPU_TemporalHintAttr:$temporalHint,
+ UnitAttr:$speculative)>,
+ Results<(outs)> {
+
+ let summary = "Prefetch data to caches.";
+ let description = [{
+ Prefetches a cache line to high-level caches using the aligned address of
+ the source `memref` and an offset provided by the indices of the element
+ containing the cache line. This provides temporal hints (e.g., regular
+ or high-priority). Note that out-of-bounds access is allowed in
+ speculative mode. Ensure the source `memref` is in address space `1`.
+
+ This operation was introduced in gfx1250.
+
+ Example:
+ ```mlir
+ amdgpu.global_prefetch %src[%i, %j] RT speculative : memref<64x64xf16, 1>
+ ```
+ }];
+
+ let assemblyFormat = [{
+ $src `[` $indices `]` $temporalHint (`speculative` $speculative^)? attr-dict `:` qualified(type($src))
+ }];
+
+ let hasVerifier = 1;
+}
+
#endif // MLIR_DIALECT_AMDGPU_IR_AMDGPUOPS_TD
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 14c12f5a787a6..9667a081d1ea1 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -3950,6 +3950,56 @@ struct AMDGPUTensorLoadStoreOpLowering
}
};
+struct GlobalPrefetchOpLowering
+ : public ConvertOpToLLVMPattern<GlobalPrefetchOp> {
+ GlobalPrefetchOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
+ : ConvertOpToLLVMPattern<GlobalPrefetchOp>(converter), chipset(chipset) {}
+
+ LogicalResult
+ matchAndRewrite(GlobalPrefetchOp op, GlobalPrefetchOpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ if (chipset < kGfx1250)
+ return op->emitOpError("is only supported on gfx1250+");
+
+ const TemporalHint hint = op.getTemporalHint();
+ const bool isSpeculative = op.getSpeculative();
+
+ int32_t llvmScopeValue = static_cast<int32_t>(hint);
+ if ((hint == TemporalHint::RT) || (hint == TemporalHint::HT))
+ llvmScopeValue = isSpeculative ? llvmScopeValue : llvmScopeValue | 1;
+
+ IntegerAttr scopeAttr = rewriter.getI32IntegerAttr(llvmScopeValue);
+
+ ValueRange indices = adaptor.getIndices();
+ Value memRef = adaptor.getSrc();
+ MemRefDescriptor descriptor(memRef);
+ Location loc = op->getLoc();
+ Value offset =
+ LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(), 0);
+ for (size_t i = 0; i < indices.size(); ++i) {
+ Value stride = descriptor.stride(rewriter, loc, i);
+ Value mulOp = LLVM::MulOp::create(rewriter, loc, rewriter.getI64Type(),
+ stride, indices[i]);
+ offset = LLVM::AddOp::create(rewriter, loc, rewriter.getI64Type(), offset,
+ mulOp);
+ }
+
+ Value basePtr = descriptor.alignedPtr(rewriter, loc);
+ Type elemTy = op.getSrc().getType().getElementType();
+ Type llvmElemTy = getTypeConverter()->convertType(elemTy);
+ Value prefetchPtr = LLVM::GEPOp::create(rewriter, loc, basePtr.getType(),
+ llvmElemTy, basePtr, offset);
+ Operation *newOp = ROCDL::GlobalPrefetchOp::create(
+ rewriter, loc, prefetchPtr, scopeAttr, {}, {}, {});
+
+ rewriter.replaceOp(op, newOp);
+ return success();
+ }
+
+private:
+ Chipset chipset;
+};
+
struct ConvertAMDGPUToROCDLPass
: public impl::ConvertAMDGPUToROCDLPassBase<ConvertAMDGPUToROCDLPass> {
using Base::Base;
@@ -4086,8 +4136,8 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
AMDGPUTensorLoadStoreOpLowering<TensorStoreFromLDSOp,
ROCDL::TensorStoreFromLDSOp>,
DsBarrierInitOpLowering, DsBarrierPollStateOpLowering,
- DsAsyncBarrierArriveOpLowering, DsBarrierArriveOpLowering>(converter,
- chipset);
+ DsAsyncBarrierArriveOpLowering, DsBarrierArriveOpLowering,
+ GlobalPrefetchOpLowering>(converter, chipset);
patterns.add<AMDGPUSwizzleBitModeLowering, DsBarrierStatePhaseOpLowering,
DsBarrierStatePendingCountOpLowering,
DsBarrierStateInitCountOpLowering,
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index b715f4ab93231..ebbdbf0b07a55 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -1302,5 +1302,35 @@ LogicalResult DsBarrierArriveOp::verify() {
return verifyDsBarrierOpCommon(*this);
}
+//===----------------------------------------------------------------------===//
+// GlobalPrefetchOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult GlobalPrefetchOp::verify() {
+ auto src = cast<MemRefType>(getSrc().getType());
+
+ const unsigned memorySpace = src.getMemorySpaceAsInt();
+ if (memorySpace != 1)
+ return this->emitOpError("the source must reside in address space `1`");
+
+ ArrayRef<int64_t> srcShape = src.getShape();
+ const size_t numIndices = getIndices().size();
+ if (srcShape.size() != numIndices)
+ return this->emitOpError(
+ "the number of indices must match the source shape size");
+
+ const TemporalHint temporalHint = getTemporalHint();
+ const bool isSpeculative = getSpeculative();
+ if (temporalHint == TemporalHint::NT)
+ return this->emitOpError("does not support NT mode");
+ if ((temporalHint == TemporalHint::NT_RT) ||
+ (temporalHint == TemporalHint::RT_NT) ||
+ (temporalHint == TemporalHint::NT_HT)) {
+ if (!isSpeculative)
+ return this->emitOpError("operates only in the speculative mode");
+ }
+ return success();
+}
+
#define GET_OP_CLASSES
#include "mlir/Dialect/AMDGPU/IR/AMDGPU.cpp.inc"
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir b/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
new file mode 100644
index 0000000000000..f8e9db5730b6b
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
@@ -0,0 +1,15 @@
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1250 --split-input-file --verify-diagnostics | FileCheck %s
+
+// CHECK-LABEL: @glb_prefetch0
+func.func @glb_prefetch0(%src : memref<64x64xf16, 1>, %i : i64, %j : i64) {
+ // CHECK: rocdl.global.prefetch %{{.*}}, scope 0 : !llvm.ptr<1>
+ amdgpu.global_prefetch %src[%i, %j] RT speculative : memref<64x64xf16, 1>
+ func.return
+}
+
+// CHECK-LABEL: @glb_prefetch1
+func.func @glb_prefetch1(%src : memref<64x64xf16, 1>, %i : i64, %j : i64) {
+ // CHECK: rocdl.global.prefetch %{{.*}}, scope 3 : !llvm.ptr<1>
+ amdgpu.global_prefetch %src[%i, %j] HT : memref<64x64xf16, 1>
+ func.return
+}
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index d1bb43e5587a6..595c00c2c1f4c 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -660,3 +660,57 @@ func.func @sparse_wmma_i4_requires_equal_length_wave64(%a: vector<8xi4>, %b: vec
%d = amdgpu.sparse_wmma 16x16x32 %a * %b + %c sparse(%idx : vector<4xi8>) {wave64} : vector<8xi4>, vector<16xi4>, vector<4xi32>
func.return %d : vector<4xi32>
}
+
+// -----
+
+// GlobalPrefetchOp: source must reside in address space 1
+func.func @global_prefetch_wrong_address_space(%src: memref<64x64xf16>, %i: i64, %j: i64) {
+ // expected-error at +1 {{'amdgpu.global_prefetch' op the source must reside in address space `1`}}
+ amdgpu.global_prefetch %src[%i, %j] RT : memref<64x64xf16>
+ func.return
+}
+
+// -----
+
+// GlobalPrefetchOp: number of indices must match source shape rank
+func.func @global_prefetch_wrong_num_indices(%src: memref<64x64xf16, 1>, %i: i64) {
+ // expected-error at +1 {{'amdgpu.global_prefetch' op the number of indices must match the source shape size}}
+ amdgpu.global_prefetch %src[%i] RT : memref<64x64xf16, 1>
+ func.return
+}
+
+// -----
+
+// GlobalPrefetchOp: NT temporal hint is not supported
+func.func @global_prefetch_nt_mode(%src: memref<64x64xf16, 1>, %i: i64, %j: i64) {
+ // expected-error at +1 {{'amdgpu.global_prefetch' op does not support NT mode}}
+ amdgpu.global_prefetch %src[%i, %j] NT : memref<64x64xf16, 1>
+ func.return
+}
+
+// -----
+
+// GlobalPrefetchOp: NT_RT requires speculative mode
+func.func @global_prefetch_nt_rt_not_speculative(%src: memref<64x64xf16, 1>, %i: i64, %j: i64) {
+ // expected-error at +1 {{'amdgpu.global_prefetch' op operates only in the speculative mode}}
+ amdgpu.global_prefetch %src[%i, %j] NT_RT : memref<64x64xf16, 1>
+ func.return
+}
+
+// -----
+
+// GlobalPrefetchOp: RT_NT requires speculative mode
+func.func @global_prefetch_rt_nt_not_speculative(%src: memref<64x64xf16, 1>, %i: i64, %j: i64) {
+ // expected-error at +1 {{'amdgpu.global_prefetch' op operates only in the speculative mode}}
+ amdgpu.global_prefetch %src[%i, %j] RT_NT : memref<64x64xf16, 1>
+ func.return
+}
+
+// -----
+
+// GlobalPrefetchOp: NT_HT requires speculative mode
+func.func @global_prefetch_nt_ht_not_speculative(%src: memref<64x64xf16, 1>, %i: i64, %j: i64) {
+ // expected-error at +1 {{'amdgpu.global_prefetch' op operates only in the speculative mode}}
+ amdgpu.global_prefetch %src[%i, %j] NT_HT : memref<64x64xf16, 1>
+ func.return
+}
>From dd2072c635e729e6e2320beba09ed8511e5d40e0 Mon Sep 17 00:00:00 2001
From: ravil-mobile <ravil.aviva.com at gmail.com>
Date: Wed, 25 Mar 2026 13:55:04 +0000
Subject: [PATCH 2/6] Addressed comments under PR#188457
---
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 6 +++---
mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp | 11 ++++++-----
2 files changed, 9 insertions(+), 8 deletions(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 9667a081d1ea1..9ee91b44e9894 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -3965,7 +3965,7 @@ struct GlobalPrefetchOpLowering
const bool isSpeculative = op.getSpeculative();
int32_t llvmScopeValue = static_cast<int32_t>(hint);
- if ((hint == TemporalHint::RT) || (hint == TemporalHint::HT))
+ if (hint == TemporalHint::RT || hint == TemporalHint::HT)
llvmScopeValue = isSpeculative ? llvmScopeValue : llvmScopeValue | 1;
IntegerAttr scopeAttr = rewriter.getI32IntegerAttr(llvmScopeValue);
@@ -3976,10 +3976,10 @@ struct GlobalPrefetchOpLowering
Location loc = op->getLoc();
Value offset =
LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(), 0);
- for (size_t i = 0; i < indices.size(); ++i) {
+ for (auto [i, index] : llvm::enumerate(indices)) {
Value stride = descriptor.stride(rewriter, loc, i);
Value mulOp = LLVM::MulOp::create(rewriter, loc, rewriter.getI64Type(),
- stride, indices[i]);
+ stride, index);
offset = LLVM::AddOp::create(rewriter, loc, rewriter.getI64Type(), offset,
mulOp);
}
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index ebbdbf0b07a55..fd5c5e39e74cc 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -1323,11 +1323,12 @@ LogicalResult GlobalPrefetchOp::verify() {
const bool isSpeculative = getSpeculative();
if (temporalHint == TemporalHint::NT)
return this->emitOpError("does not support NT mode");
- if ((temporalHint == TemporalHint::NT_RT) ||
- (temporalHint == TemporalHint::RT_NT) ||
- (temporalHint == TemporalHint::NT_HT)) {
- if (!isSpeculative)
- return this->emitOpError("operates only in the speculative mode");
+
+ if (llvm::is_contained(
+ {TemporalHint::NT_RT, TemporalHint::RT_NT, TemporalHint::NT_HT},
+ temporalHint) &&
+ !isSpeculative) {
+ return this->emitOpError("operates only in the speculative mode");
}
return success();
}
>From ad44a93b82e34a1af695431e88f8d32fb9f48cd3 Mon Sep 17 00:00:00 2001
From: Ravil Dorozhinskii <ravil.aviva.com at gmail.com>
Date: Wed, 25 Mar 2026 17:24:20 +0100
Subject: [PATCH 3/6] Update mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
Co-authored-by: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index 308814f5a2ae8..0e99a56a2fa09 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1966,7 +1966,7 @@ def AMDGPU_GlobalPrefetchOp :
the source `memref` and an offset provided by the indices of the element
containing the cache line. This provides temporal hints (e.g., regular
or high-priority). Note that out-of-bounds access is allowed in
- speculative mode. Ensure the source `memref` is in address space `1`.
+ speculative mode. The provided memref must be in the global address space (`#gpu.address_space<global>` or 1).
This operation was introduced in gfx1250.
>From 5c338531ac30e69ecbf5aeb0e4ca815075a4e643 Mon Sep 17 00:00:00 2001
From: ravil-mobile <ravil.aviva.com at gmail.com>
Date: Wed, 25 Mar 2026 19:49:27 +0000
Subject: [PATCH 4/6] Addressed comments under PR#188457
---
mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp | 9 ++++++---
mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir | 8 ++++----
2 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index fd5c5e39e74cc..4bb0ddd8bb658 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -1309,9 +1309,12 @@ LogicalResult DsBarrierArriveOp::verify() {
LogicalResult GlobalPrefetchOp::verify() {
auto src = cast<MemRefType>(getSrc().getType());
- const unsigned memorySpace = src.getMemorySpaceAsInt();
- if (memorySpace != 1)
- return this->emitOpError("the source must reside in address space `1`");
+ if (auto spaceAttr = dyn_cast<gpu::AddressSpaceAttr>(src.getMemorySpace())) {
+ if (spaceAttr.getValue() != gpu::AddressSpace::Global)
+ return this->emitOpError(
+ "the source must reside in global address space");
+ } else
+ return this->emitOpError("requires gpu address space attrubute");
ArrayRef<int64_t> srcShape = src.getShape();
const size_t numIndices = getIndices().size();
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir b/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
index f8e9db5730b6b..84906ae4dab57 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
@@ -1,15 +1,15 @@
// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1250 --split-input-file --verify-diagnostics | FileCheck %s
// CHECK-LABEL: @glb_prefetch0
-func.func @glb_prefetch0(%src : memref<64x64xf16, 1>, %i : i64, %j : i64) {
+func.func @glb_prefetch0(%src : memref<64x64xf16, #gpu.address_space<global>>, %i : i64, %j : i64) {
// CHECK: rocdl.global.prefetch %{{.*}}, scope 0 : !llvm.ptr<1>
- amdgpu.global_prefetch %src[%i, %j] RT speculative : memref<64x64xf16, 1>
+ amdgpu.global_prefetch %src[%i, %j] RT speculative : memref<64x64xf16, #gpu.address_space<global>>
func.return
}
// CHECK-LABEL: @glb_prefetch1
-func.func @glb_prefetch1(%src : memref<64x64xf16, 1>, %i : i64, %j : i64) {
+func.func @glb_prefetch1(%src : memref<64x64xf16, #gpu.address_space<global>>, %i : i64, %j : i64) {
// CHECK: rocdl.global.prefetch %{{.*}}, scope 3 : !llvm.ptr<1>
- amdgpu.global_prefetch %src[%i, %j] HT : memref<64x64xf16, 1>
+ amdgpu.global_prefetch %src[%i, %j] HT : memref<64x64xf16, #gpu.address_space<global>>
func.return
}
>From 49ec5dfaadf31e1ae2b3a6bf4f82285a4deb8a86 Mon Sep 17 00:00:00 2001
From: Ravil Dorozhinskii <ravil.aviva.com at gmail.com>
Date: Thu, 26 Mar 2026 13:20:28 +0100
Subject: [PATCH 5/6] Update
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
Co-authored-by: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
---
.../Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 16 ++--------------
1 file changed, 2 insertions(+), 14 deletions(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 9ee91b44e9894..90247982ca203 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -3974,20 +3974,8 @@ struct GlobalPrefetchOpLowering
Value memRef = adaptor.getSrc();
MemRefDescriptor descriptor(memRef);
Location loc = op->getLoc();
- Value offset =
- LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(), 0);
- for (auto [i, index] : llvm::enumerate(indices)) {
- Value stride = descriptor.stride(rewriter, loc, i);
- Value mulOp = LLVM::MulOp::create(rewriter, loc, rewriter.getI64Type(),
- stride, index);
- offset = LLVM::AddOp::create(rewriter, loc, rewriter.getI64Type(), offset,
- mulOp);
- }
-
- Value basePtr = descriptor.alignedPtr(rewriter, loc);
- Type elemTy = op.getSrc().getType().getElementType();
- Type llvmElemTy = getTypeConverter()->convertType(elemTy);
- Value prefetchPtr = LLVM::GEPOp::create(rewriter, loc, basePtr.getType(),
+ auto inboundsFlags = isSpeculative ? LLVM::GEPNoWrapFlags::none : LLVM::GEPNoWrapFlags::inbounds | LLVM::GEPNoWrapFlags::nuw;
+ Value prefetchPtr = getStridedElementPtr(rewriter, loc, adaptor.getSrc(), adaptor.getIndices(), inboundsFlags);
llvmElemTy, basePtr, offset);
Operation *newOp = ROCDL::GlobalPrefetchOp::create(
rewriter, loc, prefetchPtr, scopeAttr, {}, {}, {});
>From 226e6311922d2904e721a04eb4001f2ed518324a Mon Sep 17 00:00:00 2001
From: ravil-mobile <ravil.aviva.com at gmail.com>
Date: Thu, 26 Mar 2026 12:54:24 +0000
Subject: [PATCH 6/6] Addressed comments under PR#188457
---
.../Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 17 +++++++++++++----
mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp | 8 ++------
.../AMDGPUToROCDL/global-prefetch.mlir | 4 +++-
3 files changed, 18 insertions(+), 11 deletions(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 90247982ca203..3c181fcb1e11d 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -3965,7 +3965,12 @@ struct GlobalPrefetchOpLowering
const bool isSpeculative = op.getSpeculative();
int32_t llvmScopeValue = static_cast<int32_t>(hint);
- if (hint == TemporalHint::RT || hint == TemporalHint::HT)
+
+ // Only RT and HT are lowered differently in speculative and
+ // non-speculative modes: the least significant bit of the scope value is
+ // set for the non-speculative form. The mixed variants (NT_RT, RT_NT,
+ // NT_HT) are accepted only in speculative mode, so their bit is left as is.
+ if (llvm::is_contained({TemporalHint::RT, TemporalHint::HT}, hint))
llvmScopeValue = isSpeculative ? llvmScopeValue : llvmScopeValue | 1;
IntegerAttr scopeAttr = rewriter.getI32IntegerAttr(llvmScopeValue);
@@ -3973,10 +3978,14 @@ struct GlobalPrefetchOpLowering
ValueRange indices = adaptor.getIndices();
Value memRef = adaptor.getSrc();
MemRefDescriptor descriptor(memRef);
+ MemRefType memRefType = op.getSrc().getType();
Location loc = op->getLoc();
- auto inboundsFlags = isSpeculative ? LLVM::GEPNoWrapFlags::none : LLVM::GEPNoWrapFlags::inbounds | LLVM::GEPNoWrapFlags::nuw;
- Value prefetchPtr = getStridedElementPtr(rewriter, loc, adaptor.getSrc(), adaptor.getIndices(), inboundsFlags);
- llvmElemTy, basePtr, offset);
+ auto inboundsFlags = isSpeculative ? LLVM::GEPNoWrapFlags::none
+ : LLVM::GEPNoWrapFlags::inbounds |
+ LLVM::GEPNoWrapFlags::nuw;
+ Value prefetchPtr = getStridedElementPtr(
+ rewriter, loc, memRefType, descriptor, indices, inboundsFlags);
+
Operation *newOp = ROCDL::GlobalPrefetchOp::create(
rewriter, loc, prefetchPtr, scopeAttr, {}, {}, {});
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index 4bb0ddd8bb658..c1dfa101f90c9 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -1309,12 +1309,8 @@ LogicalResult DsBarrierArriveOp::verify() {
LogicalResult GlobalPrefetchOp::verify() {
auto src = cast<MemRefType>(getSrc().getType());
- if (auto spaceAttr = dyn_cast<gpu::AddressSpaceAttr>(src.getMemorySpace())) {
- if (spaceAttr.getValue() != gpu::AddressSpace::Global)
- return this->emitOpError(
- "the source must reside in global address space");
- } else
- return this->emitOpError("requires gpu address space attrubute");
+ if (!hasGlobalMemorySpace(src.getMemorySpace()))
+ return this->emitOpError("the source must reside in global address space");
ArrayRef<int64_t> srcShape = src.getShape();
const size_t numIndices = getIndices().size();
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir b/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
index 84906ae4dab57..d38285743ac5b 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
@@ -2,6 +2,7 @@
// CHECK-LABEL: @glb_prefetch0
func.func @glb_prefetch0(%src : memref<64x64xf16, #gpu.address_space<global>>, %i : i64, %j : i64) {
+ // CHECK: %[[PTR:.*]] = llvm.getelementptr %{{.*}}[%{{.*}}] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f16
// CHECK: rocdl.global.prefetch %{{.*}}, scope 0 : !llvm.ptr<1>
amdgpu.global_prefetch %src[%i, %j] RT speculative : memref<64x64xf16, #gpu.address_space<global>>
func.return
@@ -9,7 +10,8 @@ func.func @glb_prefetch0(%src : memref<64x64xf16, #gpu.address_space<global>>, %
// CHECK-LABEL: @glb_prefetch1
func.func @glb_prefetch1(%src : memref<64x64xf16, #gpu.address_space<global>>, %i : i64, %j : i64) {
- // CHECK: rocdl.global.prefetch %{{.*}}, scope 3 : !llvm.ptr<1>
+ // CHECK: %[[PTR:.*]] = llvm.getelementptr inbounds|nuw %{{.*}}[%{{.*}}] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f16
+ // CHECK: rocdl.global.prefetch %[[PTR]], scope 3 : !llvm.ptr<1>
amdgpu.global_prefetch %src[%i, %j] HT : memref<64x64xf16, #gpu.address_space<global>>
func.return
}
More information about the Mlir-commits
mailing list