[Mlir-commits] [mlir] [MLIR][AMDGPU] Added l2-prefetch op to AMDGPU (PR #188457)

Fri Mar 27 03:47:57 PDT 2026

https://github.com/ravil-mobile updated https://github.com/llvm/llvm-project/pull/188457

>From e254bb6c801a8ce81ee9fd153e20ec126fcdfe68 Mon Sep 17 00:00:00 2001
From: ravil-mobile <ravil.aviva.com at gmail.com>
Date: Mon, 23 Mar 2026 17:34:46 +0000
Subject: [PATCH 1/9] [MLIR][AMDGPU] Added l2-prefetch op to AMDGPU

---
 .../mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td     |  3 ++
 .../mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td     | 21 ++++++++
 .../mlir/Dialect/AMDGPU/IR/AMDGPUOps.td       | 31 +++++++++++
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           | 54 ++++++++++++++++++-
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp      | 30 +++++++++++
 .../AMDGPUToROCDL/global-prefetch.mlir        | 15 ++++++
 mlir/test/Dialect/AMDGPU/invalid.mlir         | 54 +++++++++++++++++++
 7 files changed, 206 insertions(+), 2 deletions(-)
 create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td
index c862fb2fc5a3a..0e4ab8d5b6dc5 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td
@@ -47,4 +47,7 @@ def AMDGPU_SchedBarrierOpOptAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_SchedBarrierO
 def AMDGPU_MFMAPermBAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_MFMAPermB,
   "mfma_perm_b">;
 
+def AMDGPU_TemporalHintAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_TemporalHint,
+  "temporal_hint">;
+
 #endif // MLIR_DIALECT_AMDGPU_IR_AMDGPUATTRS_TD
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td
index 4ec7cb3cd7307..68bae3d255447 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td
@@ -80,4 +80,25 @@ def AMDGPU_MFMAPermB : I32Enum<"MFMAPermB",
   let cppNamespace = "::mlir::amdgpu";
 }
 
+def AMDGPU_TemporalHint : I32Enum<"TemporalHint",
+    "AMDGPU-specific prefetch temporal hints. "
+    "RT - regular temporal for both near and far caches; "
+    "NT - non-temporal for both near and far caches; "
+    "HT - high-priority temporal for both near and far caches; "
+    "LU - last-use; "
+    "NT_RT - non-temporal for near cache(s) and regular for far caches; "
+    "RT_NT - regular for near cache(s) and non-temporal for far caches; "
+    "NT_HT - non-temporal for near cache(s) and high-priority temporal for far caches; ",
+    [
+      I32EnumAttrCase<"RT",    0>,
+      I32EnumAttrCase<"NT",    1>,
+      I32EnumAttrCase<"HT",    2>,
+      I32EnumAttrCase<"LU",    3>,
+      I32EnumAttrCase<"NT_RT", 4>,
+      I32EnumAttrCase<"RT_NT", 5>,
+      I32EnumAttrCase<"NT_HT", 6>
+    ]> {
+  let cppNamespace = "::mlir::amdgpu";
+}
+
 #endif // MLIR_DIALECT_AMDGPU_IR_AMDGPUENUMS_TD
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index 3eb039305904f..308814f5a2ae8 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1952,4 +1952,35 @@ def AMDGPU_DsBarrierStatePhaseParity :
   }];
 }
 
+def AMDGPU_GlobalPrefetchOp :
+    AMDGPU_Op<"global_prefetch", [MemoryEffects<[MemWrite, MemRead]>]>,
+    Arguments<(ins AnyMemRef:$src,
+               Variadic<I64>:$indices,
+               AMDGPU_TemporalHintAttr:$temporalHint,
+               UnitAttr:$speculative)>,
+    Results<(outs)> {
+
+  let summary = "Prefetch data to caches.";
+  let description = [{
+    Prefetches a cache line to high-level caches using the aligned address of
+    the source `memref` and an offset provided by the indices of the element
+    containing the cache line. This provides temporal hints (e.g., regular
+    or high-priority). Note that out-of-bounds access is allowed in
+    speculative mode. Ensure the source `memref` is in address space `1`.
+
+    This operation was introduced in gfx1250.
+
+    Example:
+    ```mlir
+    amdgpu.global_prefetch %src[%i, %j] RT speculative : memref<64x64xf16, 1>
+    ```
+  }];
+
+  let assemblyFormat = [{
+    $src `[` $indices `]` $temporalHint (`speculative` $speculative^)? attr-dict `:` qualified(type($src))
+  }];
+
+  let hasVerifier = 1;
+}
+
 #endif // MLIR_DIALECT_AMDGPU_IR_AMDGPUOPS_TD
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 14c12f5a787a6..9667a081d1ea1 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -3950,6 +3950,56 @@ struct AMDGPUTensorLoadStoreOpLowering
   }
 };
 
+struct GlobalPrefetchOpLowering
+    : public ConvertOpToLLVMPattern<GlobalPrefetchOp> {
+  GlobalPrefetchOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
+      : ConvertOpToLLVMPattern<GlobalPrefetchOp>(converter), chipset(chipset) {}
+
+  LogicalResult
+  matchAndRewrite(GlobalPrefetchOp op, GlobalPrefetchOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (chipset < kGfx1250)
+      return op->emitOpError("is only supported on gfx1250+");
+
+    const TemporalHint hint = op.getTemporalHint();
+    const bool isSpeculative = op.getSpeculative();
+
+    int32_t llvmScopeValue = static_cast<int32_t>(hint);
+    if ((hint == TemporalHint::RT) || (hint == TemporalHint::HT))
+      llvmScopeValue = isSpeculative ? llvmScopeValue : llvmScopeValue | 1;
+
+    IntegerAttr scopeAttr = rewriter.getI32IntegerAttr(llvmScopeValue);
+
+    ValueRange indices = adaptor.getIndices();
+    Value memRef = adaptor.getSrc();
+    MemRefDescriptor descriptor(memRef);
+    Location loc = op->getLoc();
+    Value offset =
+        LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(), 0);
+    for (size_t i = 0; i < indices.size(); ++i) {
+      Value stride = descriptor.stride(rewriter, loc, i);
+      Value mulOp = LLVM::MulOp::create(rewriter, loc, rewriter.getI64Type(),
+                                        stride, indices[i]);
+      offset = LLVM::AddOp::create(rewriter, loc, rewriter.getI64Type(), offset,
+                                   mulOp);
+    }
+
+    Value basePtr = descriptor.alignedPtr(rewriter, loc);
+    Type elemTy = op.getSrc().getType().getElementType();
+    Type llvmElemTy = getTypeConverter()->convertType(elemTy);
+    Value prefetchPtr = LLVM::GEPOp::create(rewriter, loc, basePtr.getType(),
+                                            llvmElemTy, basePtr, offset);
+    Operation *newOp = ROCDL::GlobalPrefetchOp::create(
+        rewriter, loc, prefetchPtr, scopeAttr, {}, {}, {});
+
+    rewriter.replaceOp(op, newOp);
+    return success();
+  }
+
+private:
+  Chipset chipset;
+};
+
 struct ConvertAMDGPUToROCDLPass
     : public impl::ConvertAMDGPUToROCDLPassBase<ConvertAMDGPUToROCDLPass> {
   using Base::Base;
@@ -4086,8 +4136,8 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
            AMDGPUTensorLoadStoreOpLowering<TensorStoreFromLDSOp,
                                            ROCDL::TensorStoreFromLDSOp>,
            DsBarrierInitOpLowering, DsBarrierPollStateOpLowering,
-           DsAsyncBarrierArriveOpLowering, DsBarrierArriveOpLowering>(converter,
-                                                                      chipset);
+           DsAsyncBarrierArriveOpLowering, DsBarrierArriveOpLowering,
+           GlobalPrefetchOpLowering>(converter, chipset);
   patterns.add<AMDGPUSwizzleBitModeLowering, DsBarrierStatePhaseOpLowering,
                DsBarrierStatePendingCountOpLowering,
                DsBarrierStateInitCountOpLowering,
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index b715f4ab93231..ebbdbf0b07a55 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -1302,5 +1302,35 @@ LogicalResult DsBarrierArriveOp::verify() {
   return verifyDsBarrierOpCommon(*this);
 }
 
+//===----------------------------------------------------------------------===//
+// GlobalPrefetchOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult GlobalPrefetchOp::verify() {
+  auto src = cast<MemRefType>(getSrc().getType());
+
+  const unsigned memorySpace = src.getMemorySpaceAsInt();
+  if (memorySpace != 1)
+    return this->emitOpError("the source must reside in address space `1`");
+
+  ArrayRef<int64_t> srcShape = src.getShape();
+  const size_t numIndices = getIndices().size();
+  if (srcShape.size() != numIndices)
+    return this->emitOpError(
+        "the number of indices must match the source shape size");
+
+  const TemporalHint temporalHint = getTemporalHint();
+  const bool isSpeculative = getSpeculative();
+  if (temporalHint == TemporalHint::NT)
+    return this->emitOpError("does not support NT mode");
+  if ((temporalHint == TemporalHint::NT_RT) ||
+      (temporalHint == TemporalHint::RT_NT) ||
+      (temporalHint == TemporalHint::NT_HT)) {
+    if (!isSpeculative)
+      return this->emitOpError("operates only in the speculative mode");
+  }
+  return success();
+}
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/AMDGPU/IR/AMDGPU.cpp.inc"
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir b/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
new file mode 100644
index 0000000000000..f8e9db5730b6b
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
@@ -0,0 +1,15 @@
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1250 --split-input-file --verify-diagnostics | FileCheck %s
+
+// CHECK-LABEL: @glb_prefetch0
+func.func @glb_prefetch0(%src : memref<64x64xf16, 1>, %i : i64, %j : i64) {
+  // CHECK: rocdl.global.prefetch %{{.*}}, scope 0 : !llvm.ptr<1>
+  amdgpu.global_prefetch %src[%i, %j] RT speculative : memref<64x64xf16, 1>
+  func.return
+}
+
+// CHECK-LABEL: @glb_prefetch1
+func.func @glb_prefetch1(%src : memref<64x64xf16, 1>, %i : i64, %j : i64) {
+  // CHECK: rocdl.global.prefetch %{{.*}}, scope 3 : !llvm.ptr<1>
+  amdgpu.global_prefetch %src[%i, %j] HT : memref<64x64xf16, 1>
+  func.return
+}
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index d1bb43e5587a6..595c00c2c1f4c 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -660,3 +660,57 @@ func.func @sparse_wmma_i4_requires_equal_length_wave64(%a: vector<8xi4>, %b: vec
   %d = amdgpu.sparse_wmma 16x16x32 %a * %b + %c sparse(%idx : vector<4xi8>) {wave64} : vector<8xi4>, vector<16xi4>, vector<4xi32>
   func.return %d : vector<4xi32>
 }
+
+// -----
+
+// GlobalPrefetchOp: source must reside in address space 1
+func.func @global_prefetch_wrong_address_space(%src: memref<64x64xf16>, %i: i64, %j: i64) {
+  // expected-error@+1 {{'amdgpu.global_prefetch' op the source must reside in address space `1`}}
+  amdgpu.global_prefetch %src[%i, %j] RT : memref<64x64xf16>
+  func.return
+}
+
+// -----
+
+// GlobalPrefetchOp: number of indices must match source shape rank
+func.func @global_prefetch_wrong_num_indices(%src: memref<64x64xf16, 1>, %i: i64) {
+  // expected-error@+1 {{'amdgpu.global_prefetch' op the number of indices must match the source shape size}}
+  amdgpu.global_prefetch %src[%i] RT : memref<64x64xf16, 1>
+  func.return
+}
+
+// -----
+
+// GlobalPrefetchOp: NT temporal hint is not supported
+func.func @global_prefetch_nt_mode(%src: memref<64x64xf16, 1>, %i: i64, %j: i64) {
+  // expected-error@+1 {{'amdgpu.global_prefetch' op does not support NT mode}}
+  amdgpu.global_prefetch %src[%i, %j] NT : memref<64x64xf16, 1>
+  func.return
+}
+
+// -----
+
+// GlobalPrefetchOp: NT_RT requires speculative mode
+func.func @global_prefetch_nt_rt_not_speculative(%src: memref<64x64xf16, 1>, %i: i64, %j: i64) {
+  // expected-error@+1 {{'amdgpu.global_prefetch' op operates only in the speculative mode}}
+  amdgpu.global_prefetch %src[%i, %j] NT_RT : memref<64x64xf16, 1>
+  func.return
+}
+
+// -----
+
+// GlobalPrefetchOp: RT_NT requires speculative mode
+func.func @global_prefetch_rt_nt_not_speculative(%src: memref<64x64xf16, 1>, %i: i64, %j: i64) {
+  // expected-error@+1 {{'amdgpu.global_prefetch' op operates only in the speculative mode}}
+  amdgpu.global_prefetch %src[%i, %j] RT_NT : memref<64x64xf16, 1>
+  func.return
+}
+
+// -----
+
+// GlobalPrefetchOp: NT_HT requires speculative mode
+func.func @global_prefetch_nt_ht_not_speculative(%src: memref<64x64xf16, 1>, %i: i64, %j: i64) {
+  // expected-error@+1 {{'amdgpu.global_prefetch' op operates only in the speculative mode}}
+  amdgpu.global_prefetch %src[%i, %j] NT_HT : memref<64x64xf16, 1>
+  func.return
+}

>From dd2072c635e729e6e2320beba09ed8511e5d40e0 Mon Sep 17 00:00:00 2001
From: ravil-mobile <ravil.aviva.com at gmail.com>
Date: Wed, 25 Mar 2026 13:55:04 +0000
Subject: [PATCH 2/9] Addressed comments under PR#188457

---
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp |  6 +++---
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp            | 11 ++++++-----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 9667a081d1ea1..9ee91b44e9894 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -3965,7 +3965,7 @@ struct GlobalPrefetchOpLowering
     const bool isSpeculative = op.getSpeculative();
 
     int32_t llvmScopeValue = static_cast<int32_t>(hint);
-    if ((hint == TemporalHint::RT) || (hint == TemporalHint::HT))
+    if (hint == TemporalHint::RT || hint == TemporalHint::HT)
       llvmScopeValue = isSpeculative ? llvmScopeValue : llvmScopeValue | 1;
 
     IntegerAttr scopeAttr = rewriter.getI32IntegerAttr(llvmScopeValue);
@@ -3976,10 +3976,10 @@ struct GlobalPrefetchOpLowering
     Location loc = op->getLoc();
     Value offset =
         LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(), 0);
-    for (size_t i = 0; i < indices.size(); ++i) {
+    for (auto [i, index] : llvm::enumerate(indices)) {
       Value stride = descriptor.stride(rewriter, loc, i);
       Value mulOp = LLVM::MulOp::create(rewriter, loc, rewriter.getI64Type(),
-                                        stride, indices[i]);
+                                        stride, index);
       offset = LLVM::AddOp::create(rewriter, loc, rewriter.getI64Type(), offset,
                                    mulOp);
     }
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index ebbdbf0b07a55..fd5c5e39e74cc 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -1323,11 +1323,12 @@ LogicalResult GlobalPrefetchOp::verify() {
   const bool isSpeculative = getSpeculative();
   if (temporalHint == TemporalHint::NT)
     return this->emitOpError("does not support NT mode");
-  if ((temporalHint == TemporalHint::NT_RT) ||
-      (temporalHint == TemporalHint::RT_NT) ||
-      (temporalHint == TemporalHint::NT_HT)) {
-    if (!isSpeculative)
-      return this->emitOpError("operates only in the speculative mode");
+
+  if (llvm::is_contained(
+          {TemporalHint::NT_RT, TemporalHint::RT_NT, TemporalHint::NT_HT},
+          temporalHint) &&
+      !isSpeculative) {
+    return this->emitOpError("operates only in the speculative mode");
   }
   return success();
 }

>From ad44a93b82e34a1af695431e88f8d32fb9f48cd3 Mon Sep 17 00:00:00 2001
From: Ravil Dorozhinskii <ravil.aviva.com at gmail.com>
Date: Wed, 25 Mar 2026 17:24:20 +0100
Subject: [PATCH 3/9] Update mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td

Co-authored-by: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index 308814f5a2ae8..0e99a56a2fa09 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1966,7 +1966,7 @@ def AMDGPU_GlobalPrefetchOp :
     the source `memref` and an offset provided by the indices of the element
     containing the cache line. This provides temporal hints (e.g., regular
     or high-priority). Note that out-of-bounds access is allowed in
-    speculative mode. Ensure the source `memref` is in address space `1`.
+    speculative mode. The provided memref must be in the global address space (`#gpu.address_space<global>` or 1).
 
     This operation was introduced in gfx1250.
 

>From 5c338531ac30e69ecbf5aeb0e4ca815075a4e643 Mon Sep 17 00:00:00 2001
From: ravil-mobile <ravil.aviva.com at gmail.com>
Date: Wed, 25 Mar 2026 19:49:27 +0000
Subject: [PATCH 4/9] Addressed comments under PR#188457

---
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp                | 9 ++++++---
 mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir | 8 ++++----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index fd5c5e39e74cc..4bb0ddd8bb658 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -1309,9 +1309,12 @@ LogicalResult DsBarrierArriveOp::verify() {
 LogicalResult GlobalPrefetchOp::verify() {
   auto src = cast<MemRefType>(getSrc().getType());
 
-  const unsigned memorySpace = src.getMemorySpaceAsInt();
-  if (memorySpace != 1)
-    return this->emitOpError("the source must reside in address space `1`");
+  if (auto spaceAttr = dyn_cast<gpu::AddressSpaceAttr>(src.getMemorySpace())) {
+    if (spaceAttr.getValue() != gpu::AddressSpace::Global)
+      return this->emitOpError(
+          "the source must reside in global address space");
+  } else
+    return this->emitOpError("requires gpu address space attrubute");
 
   ArrayRef<int64_t> srcShape = src.getShape();
   const size_t numIndices = getIndices().size();
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir b/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
index f8e9db5730b6b..84906ae4dab57 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
@@ -1,15 +1,15 @@
 // RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1250 --split-input-file --verify-diagnostics | FileCheck %s
 
 // CHECK-LABEL: @glb_prefetch0
-func.func @glb_prefetch0(%src : memref<64x64xf16, 1>, %i : i64, %j : i64) {
+func.func @glb_prefetch0(%src : memref<64x64xf16, #gpu.address_space<global>>, %i : i64, %j : i64) {
   // CHECK: rocdl.global.prefetch %{{.*}}, scope 0 : !llvm.ptr<1>
-  amdgpu.global_prefetch %src[%i, %j] RT speculative : memref<64x64xf16, 1>
+  amdgpu.global_prefetch %src[%i, %j] RT speculative : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }
 
 // CHECK-LABEL: @glb_prefetch1
-func.func @glb_prefetch1(%src : memref<64x64xf16, 1>, %i : i64, %j : i64) {
+func.func @glb_prefetch1(%src : memref<64x64xf16, #gpu.address_space<global>>, %i : i64, %j : i64) {
   // CHECK: rocdl.global.prefetch %{{.*}}, scope 3 : !llvm.ptr<1>
-  amdgpu.global_prefetch %src[%i, %j] HT : memref<64x64xf16, 1>
+  amdgpu.global_prefetch %src[%i, %j] HT : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }

>From 49ec5dfaadf31e1ae2b3a6bf4f82285a4deb8a86 Mon Sep 17 00:00:00 2001
From: Ravil Dorozhinskii <ravil.aviva.com at gmail.com>
Date: Thu, 26 Mar 2026 13:20:28 +0100
Subject: [PATCH 5/9] Update
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

Co-authored-by: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
---
 .../Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp   | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 9ee91b44e9894..90247982ca203 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -3974,20 +3974,8 @@ struct GlobalPrefetchOpLowering
     Value memRef = adaptor.getSrc();
     MemRefDescriptor descriptor(memRef);
     Location loc = op->getLoc();
-    Value offset =
-        LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(), 0);
-    for (auto [i, index] : llvm::enumerate(indices)) {
-      Value stride = descriptor.stride(rewriter, loc, i);
-      Value mulOp = LLVM::MulOp::create(rewriter, loc, rewriter.getI64Type(),
-                                        stride, index);
-      offset = LLVM::AddOp::create(rewriter, loc, rewriter.getI64Type(), offset,
-                                   mulOp);
-    }
-
-    Value basePtr = descriptor.alignedPtr(rewriter, loc);
-    Type elemTy = op.getSrc().getType().getElementType();
-    Type llvmElemTy = getTypeConverter()->convertType(elemTy);
-    Value prefetchPtr = LLVM::GEPOp::create(rewriter, loc, basePtr.getType(),
+    auto inboundsFlags = isSpeculative ? LLVM::GEPNoWrapFlags::none : LLVM::GEPNoWrapFlags::inbounds | LLVM::GEPNoWrapFlags::nuw;
+    Value prefetchPtr = getStridedElementPtr(rewriter, loc, adaptor.getSrc(), adaptor.getIndices(), inboundsFlags);
                                             llvmElemTy, basePtr, offset);
     Operation *newOp = ROCDL::GlobalPrefetchOp::create(
         rewriter, loc, prefetchPtr, scopeAttr, {}, {}, {});

>From 226e6311922d2904e721a04eb4001f2ed518324a Mon Sep 17 00:00:00 2001
From: ravil-mobile <ravil.aviva.com at gmail.com>
Date: Thu, 26 Mar 2026 12:54:24 +0000
Subject: [PATCH 6/9] Addressed comments under PR#188457

---
 .../Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp  | 17 +++++++++++++----
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp        |  8 ++------
 .../AMDGPUToROCDL/global-prefetch.mlir          |  4 +++-
 3 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 90247982ca203..3c181fcb1e11d 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -3965,7 +3965,12 @@ struct GlobalPrefetchOpLowering
     const bool isSpeculative = op.getSpeculative();
 
     int32_t llvmScopeValue = static_cast<int32_t>(hint);
-    if (hint == TemporalHint::RT || hint == TemporalHint::HT)
+
+    // Note that only RT and HT can operate in both speculative and
+    // non-speculative modes. The other variants (NT_RT, RT_NT, NT_HT, etc.)
+    // operate only in the speculative mode and, therefore, do not require
+    // toggling the least significant bit for mode changes
+    if (llvm::is_contained({TemporalHint::RT, TemporalHint::HT}, hint))
       llvmScopeValue = isSpeculative ? llvmScopeValue : llvmScopeValue | 1;
 
     IntegerAttr scopeAttr = rewriter.getI32IntegerAttr(llvmScopeValue);
@@ -3973,10 +3978,14 @@ struct GlobalPrefetchOpLowering
     ValueRange indices = adaptor.getIndices();
     Value memRef = adaptor.getSrc();
     MemRefDescriptor descriptor(memRef);
+    MemRefType memRefType = op.getSrc().getType();
     Location loc = op->getLoc();
-    auto inboundsFlags = isSpeculative ? LLVM::GEPNoWrapFlags::none : LLVM::GEPNoWrapFlags::inbounds | LLVM::GEPNoWrapFlags::nuw;
-    Value prefetchPtr = getStridedElementPtr(rewriter, loc, adaptor.getSrc(), adaptor.getIndices(), inboundsFlags);
-                                            llvmElemTy, basePtr, offset);
+    auto inboundsFlags = isSpeculative ? LLVM::GEPNoWrapFlags::none
+                                       : LLVM::GEPNoWrapFlags::inbounds |
+                                             LLVM::GEPNoWrapFlags::nuw;
+    Value prefetchPtr = getStridedElementPtr(
+        rewriter, loc, memRefType, descriptor, indices, inboundsFlags);
+
     Operation *newOp = ROCDL::GlobalPrefetchOp::create(
         rewriter, loc, prefetchPtr, scopeAttr, {}, {}, {});
 
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index 4bb0ddd8bb658..c1dfa101f90c9 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -1309,12 +1309,8 @@ LogicalResult DsBarrierArriveOp::verify() {
 LogicalResult GlobalPrefetchOp::verify() {
   auto src = cast<MemRefType>(getSrc().getType());
 
-  if (auto spaceAttr = dyn_cast<gpu::AddressSpaceAttr>(src.getMemorySpace())) {
-    if (spaceAttr.getValue() != gpu::AddressSpace::Global)
-      return this->emitOpError(
-          "the source must reside in global address space");
-  } else
-    return this->emitOpError("requires gpu address space attrubute");
+  if (!hasGlobalMemorySpace(src.getMemorySpace()))
+    return this->emitOpError("the source must reside in global address space");
 
   ArrayRef<int64_t> srcShape = src.getShape();
   const size_t numIndices = getIndices().size();
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir b/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
index 84906ae4dab57..d38285743ac5b 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
@@ -2,6 +2,7 @@
 
 // CHECK-LABEL: @glb_prefetch0
 func.func @glb_prefetch0(%src : memref<64x64xf16, #gpu.address_space<global>>, %i : i64, %j : i64) {
+  // CHECK: %[[PTR:.*]] = llvm.getelementptr %{{.*}}[%{{.*}}] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f16
   // CHECK: rocdl.global.prefetch %{{.*}}, scope 0 : !llvm.ptr<1>
   amdgpu.global_prefetch %src[%i, %j] RT speculative : memref<64x64xf16, #gpu.address_space<global>>
   func.return
@@ -9,7 +10,8 @@ func.func @glb_prefetch0(%src : memref<64x64xf16, #gpu.address_space<global>>, %
 
 // CHECK-LABEL: @glb_prefetch1
 func.func @glb_prefetch1(%src : memref<64x64xf16, #gpu.address_space<global>>, %i : i64, %j : i64) {
-  // CHECK: rocdl.global.prefetch %{{.*}}, scope 3 : !llvm.ptr<1>
+  // CHECK: %[[PTR:.*]] = llvm.getelementptr inbounds|nuw %{{.*}}[%{{.*}}] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f16
+  // CHECK: rocdl.global.prefetch %[[PTR]], scope 3 : !llvm.ptr<1>
   amdgpu.global_prefetch %src[%i, %j] HT : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }

>From c637ecf9d39b149c47fab0ac65eb9895109c5871 Mon Sep 17 00:00:00 2001
From: ravil-mobile <ravil.aviva.com at gmail.com>
Date: Thu, 26 Mar 2026 13:28:05 +0000
Subject: [PATCH 7/9] Addressed comments under PR#188457

---
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp |  5 +++-
 mlir/test/Dialect/AMDGPU/invalid.mlir    | 36 +++++++++++++++---------
 2 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index c1dfa101f90c9..8c7038cbaaf92 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -1309,7 +1309,10 @@ LogicalResult DsBarrierArriveOp::verify() {
 LogicalResult GlobalPrefetchOp::verify() {
   auto src = cast<MemRefType>(getSrc().getType());
 
-  if (!hasGlobalMemorySpace(src.getMemorySpace()))
+  Attribute memSpace = src.getMemorySpace();
+  if (!memSpace)
+    return this->emitOpError("the source must have address space attribute");
+  if (!hasGlobalMemorySpace(memSpace))
     return this->emitOpError("the source must reside in global address space");
 
   ArrayRef<int64_t> srcShape = src.getShape();
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 595c00c2c1f4c..86375aed9389b 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -663,54 +663,64 @@ func.func @sparse_wmma_i4_requires_equal_length_wave64(%a: vector<8xi4>, %b: vec
 
 // -----
 
-// GlobalPrefetchOp: source must reside in address space 1
-func.func @global_prefetch_wrong_address_space(%src: memref<64x64xf16>, %i: i64, %j: i64) {
-  // expected-error@+1 {{'amdgpu.global_prefetch' op the source must reside in address space `1`}}
+// GlobalPrefetchOp: source must have address space attribute
+func.func @global_prefetch_no_address_space(%src: memref<64x64xf16>, %i: i64, %j: i64) {
+  // expected-error@+1 {{'amdgpu.global_prefetch' op the source must have address space attribute}}
   amdgpu.global_prefetch %src[%i, %j] RT : memref<64x64xf16>
   func.return
 }
 
+
+// -----
+
+// GlobalPrefetchOp: source must reside in global address space
+func.func @global_prefetch_wrong_address_space(%src: memref<64x64xf16, #gpu.address_space<workgroup>>, %i: i64, %j: i64) {
+  // expected-error@+1 {{'amdgpu.global_prefetch' op the source must reside in global address space}}
+  amdgpu.global_prefetch %src[%i, %j] RT : memref<64x64xf16, #gpu.address_space<workgroup>>
+  func.return
+}
+
 // -----
 
 // GlobalPrefetchOp: number of indices must match source shape rank
-func.func @global_prefetch_wrong_num_indices(%src: memref<64x64xf16, 1>, %i: i64) {
+func.func @global_prefetch_wrong_num_indices(%src: memref<64x64xf16, #gpu.address_space<global>>, %i: i64) {
   // expected-error@+1 {{'amdgpu.global_prefetch' op the number of indices must match the source shape size}}
-  amdgpu.global_prefetch %src[%i] RT : memref<64x64xf16, 1>
+  amdgpu.global_prefetch %src[%i] RT : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }
 
 // -----
 
 // GlobalPrefetchOp: NT temporal hint is not supported
-func.func @global_prefetch_nt_mode(%src: memref<64x64xf16, 1>, %i: i64, %j: i64) {
+func.func @global_prefetch_nt_mode(%src: memref<64x64xf16, #gpu.address_space<global>>, %i: i64, %j: i64) {
   // expected-error@+1 {{'amdgpu.global_prefetch' op does not support NT mode}}
-  amdgpu.global_prefetch %src[%i, %j] NT : memref<64x64xf16, 1>
+  amdgpu.global_prefetch %src[%i, %j] NT : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }
 
 // -----
 
 // GlobalPrefetchOp: NT_RT requires speculative mode
-func.func @global_prefetch_nt_rt_not_speculative(%src: memref<64x64xf16, 1>, %i: i64, %j: i64) {
+func.func @global_prefetch_nt_rt_not_speculative(%src: memref<64x64xf16, #gpu.address_space<global>>, %i: i64, %j: i64) {
   // expected-error@+1 {{'amdgpu.global_prefetch' op operates only in the speculative mode}}
-  amdgpu.global_prefetch %src[%i, %j] NT_RT : memref<64x64xf16, 1>
+  amdgpu.global_prefetch %src[%i, %j] NT_RT : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }
 
 // -----
 
 // GlobalPrefetchOp: RT_NT requires speculative mode
-func.func @global_prefetch_rt_nt_not_speculative(%src: memref<64x64xf16, 1>, %i: i64, %j: i64) {
+func.func @global_prefetch_rt_nt_not_speculative(%src: memref<64x64xf16, #gpu.address_space<global>>, %i: i64, %j: i64) {
   // expected-error@+1 {{'amdgpu.global_prefetch' op operates only in the speculative mode}}
-  amdgpu.global_prefetch %src[%i, %j] RT_NT : memref<64x64xf16, 1>
+  amdgpu.global_prefetch %src[%i, %j] RT_NT : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }
 
 // -----
 
 // GlobalPrefetchOp: NT_HT requires speculative mode
-func.func @global_prefetch_nt_ht_not_speculative(%src: memref<64x64xf16, 1>, %i: i64, %j: i64) {
+func.func @global_prefetch_nt_ht_not_speculative(%src: memref<64x64xf16, #gpu.address_space<global>>, %i: i64, %j: i64) {
   // expected-error@+1 {{'amdgpu.global_prefetch' op operates only in the speculative mode}}
-  amdgpu.global_prefetch %src[%i, %j] NT_HT : memref<64x64xf16, 1>
+  amdgpu.global_prefetch %src[%i, %j] NT_HT : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }

>From 484d159e821bbebc416894f68301576ea630fd51 Mon Sep 17 00:00:00 2001
From: ravil-mobile <ravil.aviva.com at gmail.com>
Date: Thu, 26 Mar 2026 16:57:51 +0000
Subject: [PATCH 8/9] Added a scope attribute to prefetch op

---
 .../mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td       |  3 +++
 .../mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td       | 17 ++++++++++++++++-
 .../include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td |  8 +++++---
 .../Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp  | 12 ++++++++----
 .../AMDGPUToROCDL/global-prefetch.mlir          | 12 ++++++++++--
 mlir/test/Dialect/AMDGPU/invalid.mlir           | 14 +++++++-------
 6 files changed, 49 insertions(+), 17 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td
index 0e4ab8d5b6dc5..2f8d4a086c690 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td
@@ -50,4 +50,7 @@ def AMDGPU_MFMAPermBAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_MFMAPermB,
 def AMDGPU_TemporalHintAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_TemporalHint,
   "temporal_hint">;
 
+def AMDGPU_CacheScopeAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_CacheScope,
+  "cache_scope">;
+
 #endif // MLIR_DIALECT_AMDGPU_IR_AMDGPUATTRS_TD
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td
index 68bae3d255447..303a11b30c79f 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td
@@ -88,7 +88,7 @@ def AMDGPU_TemporalHint : I32Enum<"TemporalHint",
     "LU - last-use; "
     "NT_RT - non-temporal for near cache(s) and regular for far caches; "
     "RT_NT - regular for near cache(s) and non-temporal for far caches; "
-    "NT_HT - non-temporal for near cache(s) and high-priority temporal for far caches; ",
+    "NT_HT - non-temporal for near cache(s) and high-priority temporal for far caches",
     [
       I32EnumAttrCase<"RT",    0>,
       I32EnumAttrCase<"NT",    1>,
@@ -101,4 +101,19 @@ def AMDGPU_TemporalHint : I32Enum<"TemporalHint",
   let cppNamespace = "::mlir::amdgpu";
 }
 
+def AMDGPU_CacheScope : I32Enum<"Scope",
+    "AMDGPU-specific cache scopes. "
+    "WGP - workgroup processor (CUs); "
+    "SE - shader engine (GL2); "
+    "DEV - device; "
+    "SYS - system",
+    [
+      I32EnumAttrCase<"WGP",    0>,
+      I32EnumAttrCase<"SE",     1>,
+      I32EnumAttrCase<"DEV",    2>,
+      I32EnumAttrCase<"SYS",    3>
+    ]> {
+  let cppNamespace = "::mlir::amdgpu";
+}
+
 #endif // MLIR_DIALECT_AMDGPU_IR_AMDGPUENUMS_TD
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index 0e99a56a2fa09..0f01a46e147f5 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1957,6 +1957,7 @@ def AMDGPU_GlobalPrefetchOp :
     Arguments<(ins AnyMemRef:$src,
                Variadic<I64>:$indices,
                AMDGPU_TemporalHintAttr:$temporalHint,
+               AMDGPU_CacheScopeAttr:$cacheScope,
                UnitAttr:$speculative)>,
     Results<(outs)> {
 
@@ -1966,18 +1967,19 @@ def AMDGPU_GlobalPrefetchOp :
     the source `memref` and an offset provided by the indices of the element
     containing the cache line. This provides temporal hints (e.g., regular
     or high-priority). Note that out-of-bounds access is allowed in
-    speculative mode. The provided memref must be in the global address space (`#gpu.address_space<global>` or 1).
+    speculative mode. The provided memref must be in the global address space
+    (`#gpu.address_space<global>` or 1).
 
     This operation was introduced in gfx1250.
 
     Example:
     ```mlir
-    amdgpu.global_prefetch %src[%i, %j] RT speculative : memref<64x64xf16, 1>
+    amdgpu.global_prefetch %src[%i, %j] RT SE speculative : memref<64x64xf16, #gpu.address_space<global>>
     ```
   }];
 
   let assemblyFormat = [{
-    $src `[` $indices `]` $temporalHint (`speculative` $speculative^)? attr-dict `:` qualified(type($src))
+    $src `[` $indices `]` $temporalHint $cacheScope (`speculative` $speculative^)? attr-dict `:` qualified(type($src))
   }];
 
   let hasVerifier = 1;
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 3c181fcb1e11d..2fa560a5cd826 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -3964,16 +3964,20 @@ struct GlobalPrefetchOpLowering
     const TemporalHint hint = op.getTemporalHint();
     const bool isSpeculative = op.getSpeculative();
 
-    int32_t llvmScopeValue = static_cast<int32_t>(hint);
+    int32_t immArgValue = static_cast<int32_t>(hint);
 
     // Note that only RT and HT can operate in both speculative and
     // non-speculative modes. The other variants (NT_RT, RT_NT, NT_HT, etc.)
     // operate only in the speculative mode and, therefore, do not require
     // toggling the least significant bit for mode changes
+    // Temporal hint is encoded in lower bits - i.e. [2:0]
     if (llvm::is_contained({TemporalHint::RT, TemporalHint::HT}, hint))
-      llvmScopeValue = isSpeculative ? llvmScopeValue : llvmScopeValue | 1;
+      immArgValue = isSpeculative ? immArgValue : immArgValue | 1;
 
-    IntegerAttr scopeAttr = rewriter.getI32IntegerAttr(llvmScopeValue);
+    // Prefetch scope level is encoded in upper bits - i.e., [4:3]
+    immArgValue = static_cast<int32_t>(op.getCacheScope()) << 3 | immArgValue;
+
+    IntegerAttr immArgAttr = rewriter.getI32IntegerAttr(immArgValue);
 
     ValueRange indices = adaptor.getIndices();
     Value memRef = adaptor.getSrc();
@@ -3987,7 +3991,7 @@ struct GlobalPrefetchOpLowering
         rewriter, loc, memRefType, descriptor, indices, inboundsFlags);
 
     Operation *newOp = ROCDL::GlobalPrefetchOp::create(
-        rewriter, loc, prefetchPtr, scopeAttr, {}, {}, {});
+        rewriter, loc, prefetchPtr, immArgAttr, {}, {}, {});
 
     rewriter.replaceOp(op, newOp);
     return success();
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir b/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
index d38285743ac5b..643b809d90675 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
@@ -4,7 +4,7 @@
 func.func @glb_prefetch0(%src : memref<64x64xf16, #gpu.address_space<global>>, %i : i64, %j : i64) {
   // CHECK: %[[PTR:.*]] = llvm.getelementptr %{{.*}}[%{{.*}}] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f16
   // CHECK: rocdl.global.prefetch %{{.*}}, scope 0 : !llvm.ptr<1>
-  amdgpu.global_prefetch %src[%i, %j] RT speculative : memref<64x64xf16, #gpu.address_space<global>>
+  amdgpu.global_prefetch %src[%i, %j] RT WGP speculative : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }
 
@@ -12,6 +12,14 @@ func.func @glb_prefetch0(%src : memref<64x64xf16, #gpu.address_space<global>>, %
 func.func @glb_prefetch1(%src : memref<64x64xf16, #gpu.address_space<global>>, %i : i64, %j : i64) {
   // CHECK: %[[PTR:.*]] = llvm.getelementptr inbounds|nuw %{{.*}}[%{{.*}}] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f16
   // CHECK: rocdl.global.prefetch %[[PTR]], scope 3 : !llvm.ptr<1>
-  amdgpu.global_prefetch %src[%i, %j] HT : memref<64x64xf16, #gpu.address_space<global>>
+  amdgpu.global_prefetch %src[%i, %j] HT WGP : memref<64x64xf16, #gpu.address_space<global>>
+  func.return
+}
+
+// CHECK-LABEL: @glb_prefetch2
+func.func @glb_prefetch2(%src : memref<64x64xf16, #gpu.address_space<global>>, %i : i64, %j : i64) {
+  // CHECK: %[[PTR:.*]] = llvm.getelementptr %{{.*}}[%{{.*}}] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f16
+  // CHECK: rocdl.global.prefetch %[[PTR]], scope 10 : !llvm.ptr<1>
+  amdgpu.global_prefetch %src[%i, %j] HT SE speculative : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 86375aed9389b..85919ac7e5db4 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -666,7 +666,7 @@ func.func @sparse_wmma_i4_requires_equal_length_wave64(%a: vector<8xi4>, %b: vec
 // GlobalPrefetchOp: source must have address space attribute
 func.func @global_prefetch_no_address_space(%src: memref<64x64xf16>, %i: i64, %j: i64) {
   // expected-error@+1 {{'amdgpu.global_prefetch' op the source must have address space attribute}}
-  amdgpu.global_prefetch %src[%i, %j] RT : memref<64x64xf16>
+  amdgpu.global_prefetch %src[%i, %j] RT WGP : memref<64x64xf16>
   func.return
 }
 
@@ -676,7 +676,7 @@ func.func @global_prefetch_no_address_space(%src: memref<64x64xf16>, %i: i64, %j
 // GlobalPrefetchOp: source must reside in global address space
 func.func @global_prefetch_wrong_address_space(%src: memref<64x64xf16, #gpu.address_space<workgroup>>, %i: i64, %j: i64) {
   // expected-error@+1 {{'amdgpu.global_prefetch' op the source must reside in global address space}}
-  amdgpu.global_prefetch %src[%i, %j] RT : memref<64x64xf16, #gpu.address_space<workgroup>>
+  amdgpu.global_prefetch %src[%i, %j] RT SE : memref<64x64xf16, #gpu.address_space<workgroup>>
   func.return
 }
 
@@ -685,7 +685,7 @@ func.func @global_prefetch_wrong_address_space(%src: memref<64x64xf16, #gpu.addr
 // GlobalPrefetchOp: number of indices must match source shape rank
 func.func @global_prefetch_wrong_num_indices(%src: memref<64x64xf16, #gpu.address_space<global>>, %i: i64) {
   // expected-error@+1 {{'amdgpu.global_prefetch' op the number of indices must match the source shape size}}
-  amdgpu.global_prefetch %src[%i] RT : memref<64x64xf16, #gpu.address_space<global>>
+  amdgpu.global_prefetch %src[%i] RT DEV : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }
 
@@ -694,7 +694,7 @@ func.func @global_prefetch_wrong_num_indices(%src: memref<64x64xf16, #gpu.addres
 // GlobalPrefetchOp: NT temporal hint is not supported
 func.func @global_prefetch_nt_mode(%src: memref<64x64xf16, #gpu.address_space<global>>, %i: i64, %j: i64) {
   // expected-error@+1 {{'amdgpu.global_prefetch' op does not support NT mode}}
-  amdgpu.global_prefetch %src[%i, %j] NT : memref<64x64xf16, #gpu.address_space<global>>
+  amdgpu.global_prefetch %src[%i, %j] NT SYS : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }
 
@@ -703,7 +703,7 @@ func.func @global_prefetch_nt_mode(%src: memref<64x64xf16, #gpu.address_space<gl
 // GlobalPrefetchOp: NT_RT requires speculative mode
 func.func @global_prefetch_nt_rt_not_speculative(%src: memref<64x64xf16, #gpu.address_space<global>>, %i: i64, %j: i64) {
   // expected-error@+1 {{'amdgpu.global_prefetch' op operates only in the speculative mode}}
-  amdgpu.global_prefetch %src[%i, %j] NT_RT : memref<64x64xf16, #gpu.address_space<global>>
+  amdgpu.global_prefetch %src[%i, %j] NT_RT WGP : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }
 
@@ -712,7 +712,7 @@ func.func @global_prefetch_nt_rt_not_speculative(%src: memref<64x64xf16, #gpu.ad
 // GlobalPrefetchOp: RT_NT requires speculative mode
 func.func @global_prefetch_rt_nt_not_speculative(%src: memref<64x64xf16, #gpu.address_space<global>>, %i: i64, %j: i64) {
   // expected-error@+1 {{'amdgpu.global_prefetch' op operates only in the speculative mode}}
-  amdgpu.global_prefetch %src[%i, %j] RT_NT : memref<64x64xf16, #gpu.address_space<global>>
+  amdgpu.global_prefetch %src[%i, %j] RT_NT SE : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }
 
@@ -721,6 +721,6 @@ func.func @global_prefetch_rt_nt_not_speculative(%src: memref<64x64xf16, #gpu.ad
 // GlobalPrefetchOp: NT_HT requires speculative mode
 func.func @global_prefetch_nt_ht_not_speculative(%src: memref<64x64xf16, #gpu.address_space<global>>, %i: i64, %j: i64) {
   // expected-error@+1 {{'amdgpu.global_prefetch' op operates only in the speculative mode}}
-  amdgpu.global_prefetch %src[%i, %j] NT_HT : memref<64x64xf16, #gpu.address_space<global>>
+  amdgpu.global_prefetch %src[%i, %j] NT_HT DEV : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }

>From f5c576add8cca7c57ccd1d39f88ca503387706d3 Mon Sep 17 00:00:00 2001
From: ravil-mobile <ravil.aviva.com at gmail.com>
Date: Fri, 27 Mar 2026 10:46:43 +0000
Subject: [PATCH 9/9] Addressed comments under PR#188457

---
 .../mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td     | 22 +++++++++----------
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           |  7 +++---
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp      | 11 ++++++++--
 mlir/test/Dialect/AMDGPU/invalid.mlir         | 11 +++++++++-
 4 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td
index 303a11b30c79f..89432f20575d6 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td
@@ -90,13 +90,13 @@ def AMDGPU_TemporalHint : I32Enum<"TemporalHint",
     "RT_NT - regular for near cache(s) and non-temporal for far caches; "
     "NT_HT - non-temporal for near cache(s) and high-priority temporal for far caches",
     [
-      I32EnumAttrCase<"RT",    0>,
-      I32EnumAttrCase<"NT",    1>,
-      I32EnumAttrCase<"HT",    2>,
-      I32EnumAttrCase<"LU",    3>,
-      I32EnumAttrCase<"NT_RT", 4>,
-      I32EnumAttrCase<"RT_NT", 5>,
-      I32EnumAttrCase<"NT_HT", 6>
+      I32EnumCase<"RT",    0>,
+      I32EnumCase<"NT",    1>,
+      I32EnumCase<"HT",    2>,
+      I32EnumCase<"LU",    3>,
+      I32EnumCase<"NT_RT", 4>,
+      I32EnumCase<"RT_NT", 5>,
+      I32EnumCase<"NT_HT", 6>
     ]> {
   let cppNamespace = "::mlir::amdgpu";
 }
@@ -108,10 +108,10 @@ def AMDGPU_CacheScope : I32Enum<"Scope",
     "DEV - device; "
     "SYS - system",
     [
-      I32EnumAttrCase<"WGP",    0>,
-      I32EnumAttrCase<"SE",     1>,
-      I32EnumAttrCase<"DEV",    2>,
-      I32EnumAttrCase<"SYS",    3>
+      I32EnumCase<"WGP",    0>,
+      I32EnumCase<"SE",     1>,
+      I32EnumCase<"DEV",    2>,
+      I32EnumCase<"SYS",    3>
     ]> {
   let cppNamespace = "::mlir::amdgpu";
 }
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 2fa560a5cd826..96d4e5da3388c 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -3990,10 +3990,9 @@ struct GlobalPrefetchOpLowering
     Value prefetchPtr = getStridedElementPtr(
         rewriter, loc, memRefType, descriptor, indices, inboundsFlags);
 
-    Operation *newOp = ROCDL::GlobalPrefetchOp::create(
-        rewriter, loc, prefetchPtr, immArgAttr, {}, {}, {});
-
-    rewriter.replaceOp(op, newOp);
+    rewriter.replaceOpWithNewOp<ROCDL::GlobalPrefetchOp>(
+        op, prefetchPtr, immArgAttr, mlir::ArrayAttr{}, mlir::ArrayAttr{},
+        mlir::ArrayAttr{});
     return success();
   }
 
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index 8c7038cbaaf92..e27bd461908cd 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -1323,8 +1323,15 @@ LogicalResult GlobalPrefetchOp::verify() {
 
   const TemporalHint temporalHint = getTemporalHint();
   const bool isSpeculative = getSpeculative();
-  if (temporalHint == TemporalHint::NT)
-    return this->emitOpError("does not support NT mode");
+
+  // Note that temporal hints are shared between load, store,
+  // prefetch, etc. instructions. However, some instructions
+  // operate only with a subset of hints according to the ISA
+  // documentation. In case of global prefetch, non-temporal (NT)
+  // and last-use (LU) hints are not used. The extra bits of encoding
+  // are used to encode speculative or non-speculative instruction behavior
+  if (llvm::is_contained({TemporalHint::NT, TemporalHint::LU}, temporalHint))
+    return this->emitOpError("does not support NT and LU modes");
 
   if (llvm::is_contained(
           {TemporalHint::NT_RT, TemporalHint::RT_NT, TemporalHint::NT_HT},
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 85919ac7e5db4..1eb64ddecf695 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -693,13 +693,22 @@ func.func @global_prefetch_wrong_num_indices(%src: memref<64x64xf16, #gpu.addres
 
 // GlobalPrefetchOp: NT temporal hint is not supported
 func.func @global_prefetch_nt_mode(%src: memref<64x64xf16, #gpu.address_space<global>>, %i: i64, %j: i64) {
-  // expected-error@+1 {{'amdgpu.global_prefetch' op does not support NT mode}}
+  // expected-error@+1 {{'amdgpu.global_prefetch' op does not support NT and LU modes}}
   amdgpu.global_prefetch %src[%i, %j] NT SYS : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }
 
 // -----
 
+// GlobalPrefetchOp: LU temporal hint is not supported
+func.func @global_prefetch_lu_mode(%src: memref<64x64xf16, #gpu.address_space<global>>, %i: i64, %j: i64) {
+  // expected-error@+1 {{'amdgpu.global_prefetch' op does not support NT and LU modes}}
+  amdgpu.global_prefetch %src[%i, %j] LU DEV : memref<64x64xf16, #gpu.address_space<global>>
+  func.return
+}
+
+// -----
+
 // GlobalPrefetchOp: NT_RT requires speculative mode
 func.func @global_prefetch_nt_rt_not_speculative(%src: memref<64x64xf16, #gpu.address_space<global>>, %i: i64, %j: i64) {
   // expected-error@+1 {{'amdgpu.global_prefetch' op operates only in the speculative mode}}