[Mlir-commits] [mlir] [AMDGPU][MLIR][NFC] moved enc computation to a dedicated method (PR #189339)

Thu Apr 2 03:06:24 PDT 2026

https://github.com/ravil-mobile updated https://github.com/llvm/llvm-project/pull/189339

>From d89c00a1c55c5f14783833c66b74a563273353d3 Mon Sep 17 00:00:00 2001
From: ravil-mobile <ravil.aviva.com at gmail.com>
Date: Mon, 30 Mar 2026 09:02:50 +0000
Subject: [PATCH 1/3] [AMDGPU][MLIR][NFC] moved enc computation to a dedicated
 method

---
 .../include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td |  4 ++++
 .../Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp  | 17 ++---------------
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp        | 17 +++++++++++++++++
 3 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index 0f01a46e147f5..5028f17bfa419 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1978,6 +1978,10 @@ def AMDGPU_GlobalPrefetchOp :
     ```
   }];
 
+  let extraClassDeclaration = [{
+    static int32_t getLLVMEncoding(amdgpu::TemporalHint hint, amdgpu::Scope scope, bool isSpeculative);
+  }];
+
   let assemblyFormat = [{
     $src `[` $indices `]` $temporalHint $cacheScope (`speculative` $speculative^)? attr-dict `:` qualified(type($src))
   }];
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 96d4e5da3388c..0135859404da1 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -3961,22 +3961,9 @@ struct GlobalPrefetchOpLowering
     if (chipset < kGfx1250)
       return op->emitOpError("is only supported on gfx1250+");
 
-    const TemporalHint hint = op.getTemporalHint();
     const bool isSpeculative = op.getSpeculative();
-
-    int32_t immArgValue = static_cast<int32_t>(hint);
-
-    // Note that only RT and HT can operate in both speculative and
-    // non-speculative modes. The other variants (NT_RT, RT_NT, NT_HT, etc.)
-    // operate only in the speculative mode and, therefore, do not require
-    // toggling the least significant bit for mode changes
-    // Temporal hint is encoded in lower bits - i.e. [2:0]
-    if (llvm::is_contained({TemporalHint::RT, TemporalHint::HT}, hint))
-      immArgValue = isSpeculative ? immArgValue : immArgValue | 1;
-
-    // Prefetch scope level is encoded in upper bits - i.e., [4:3]
-    immArgValue = static_cast<int32_t>(op.getCacheScope()) << 3 | immArgValue;
-
+    const int32_t immArgValue = GlobalPrefetchOp::getLLVMEncoding(
+        op.getTemporalHint(), op.getCacheScope(), isSpeculative);
     IntegerAttr immArgAttr = rewriter.getI32IntegerAttr(immArgValue);
 
     ValueRange indices = adaptor.getIndices();
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index e27bd461908cd..9d5850a46e661 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -1306,6 +1306,23 @@ LogicalResult DsBarrierArriveOp::verify() {
 // GlobalPrefetchOp
 //===----------------------------------------------------------------------===//
 
+int32_t GlobalPrefetchOp::getLLVMEncoding(amdgpu::TemporalHint hint,
+                                          amdgpu::Scope scope,
+                                          bool isSpeculative) {
+  int32_t immArg = static_cast<int32_t>(hint);
+
+  // Note that only RT and HT can operate in both speculative and
+  // non-speculative modes. The other variants (NT_RT, RT_NT, NT_HT, etc.)
+  // operate only in the speculative mode and, therefore, do not require
+  // toggling the least significant bit for mode changes
+  // Temporal hint is encoded in lower bits - i.e. [2:0]
+  if (llvm::is_contained({TemporalHint::RT, TemporalHint::HT}, hint))
+    immArg = isSpeculative ? immArg : immArg | 1;
+
+  // Prefetch scope level is encoded in upper bits - i.e., [4:3]
+  return static_cast<int32_t>(scope) << 3 | immArg;
+}
+
 LogicalResult GlobalPrefetchOp::verify() {
   auto src = cast<MemRefType>(getSrc().getType());
 

>From 68ac20d372c3ce0063970158e7ba9dfbfc84f3f6 Mon Sep 17 00:00:00 2001
From: ravil-mobile <ravil.aviva.com at gmail.com>
Date: Mon, 30 Mar 2026 13:39:32 +0000
Subject: [PATCH 2/3] Renamed TemporalHint to LoadTemporalHint

---
 .../include/mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td |  4 ++--
 .../include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td |  4 ++--
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td  |  4 ++--
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp          | 15 ++++++++-------
 4 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td
index 2f8d4a086c690..51e1267571243 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUAttrs.td
@@ -47,8 +47,8 @@ def AMDGPU_SchedBarrierOpOptAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_SchedBarrierO
 def AMDGPU_MFMAPermBAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_MFMAPermB,
   "mfma_perm_b">;
 
-def AMDGPU_TemporalHintAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_TemporalHint,
-  "temporal_hint">;
+def AMDGPU_LoadTemporalHintAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_LoadTemporalHint,
+  "load_temporal_hint">;
 
 def AMDGPU_CacheScopeAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_CacheScope,
   "cache_scope">;
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td
index 89432f20575d6..fe4723b635dc6 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.td
@@ -80,8 +80,8 @@ def AMDGPU_MFMAPermB : I32Enum<"MFMAPermB",
   let cppNamespace = "::mlir::amdgpu";
 }
 
-def AMDGPU_TemporalHint : I32Enum<"TemporalHint",
-    "AMDGPU-specific prefetch temporal hints. "
+def AMDGPU_LoadTemporalHint : I32Enum<"LoadTemporalHint",
+    "AMDGPU-specific prefetch temporal hints for load instructions. "
     "RT - regular temporal for both near and far caches; "
     "NT - non-temporal for both near and far caches; "
     "HT - high-priority temporal for both near and far caches; "
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index 5028f17bfa419..ff5fea4a67ad6 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1956,7 +1956,7 @@ def AMDGPU_GlobalPrefetchOp :
     AMDGPU_Op<"global_prefetch", [MemoryEffects<[MemWrite, MemRead]>]>,
     Arguments<(ins AnyMemRef:$src,
                Variadic<I64>:$indices,
-               AMDGPU_TemporalHintAttr:$temporalHint,
+               AMDGPU_LoadTemporalHintAttr:$temporalHint,
                AMDGPU_CacheScopeAttr:$cacheScope,
                UnitAttr:$speculative)>,
     Results<(outs)> {
@@ -1979,7 +1979,7 @@ def AMDGPU_GlobalPrefetchOp :
   }];
 
   let extraClassDeclaration = [{
-    static int32_t getLLVMEncoding(amdgpu::TemporalHint hint, amdgpu::Scope scope, bool isSpeculative);
+    static int32_t getLLVMEncoding(amdgpu::LoadTemporalHint hint, amdgpu::Scope scope, bool isSpeculative);
   }];
 
   let assemblyFormat = [{
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index 9d5850a46e661..e2df0aa41bebc 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -1306,7 +1306,7 @@ LogicalResult DsBarrierArriveOp::verify() {
 // GlobalPrefetchOp
 //===----------------------------------------------------------------------===//
 
-int32_t GlobalPrefetchOp::getLLVMEncoding(amdgpu::TemporalHint hint,
+int32_t GlobalPrefetchOp::getLLVMEncoding(amdgpu::LoadTemporalHint hint,
                                           amdgpu::Scope scope,
                                           bool isSpeculative) {
   int32_t immArg = static_cast<int32_t>(hint);
@@ -1316,7 +1316,7 @@ int32_t GlobalPrefetchOp::getLLVMEncoding(amdgpu::TemporalHint hint,
   // operate only in the speculative mode and, therefore, do not require
   // toggling the least significant bit for mode changes
   // Temporal hint is encoded in lower bits - i.e. [2:0]
-  if (llvm::is_contained({TemporalHint::RT, TemporalHint::HT}, hint))
+  if (llvm::is_contained({LoadTemporalHint::RT, LoadTemporalHint::HT}, hint))
     immArg = isSpeculative ? immArg : immArg | 1;
 
   // Prefetch scope level is encoded in upper bits - i.e., [4:3]
@@ -1338,7 +1338,7 @@ LogicalResult GlobalPrefetchOp::verify() {
     return this->emitOpError(
         "the number of indices must match the source shape size");
 
-  const TemporalHint temporalHint = getTemporalHint();
+  const LoadTemporalHint temporalHint = getTemporalHint();
   const bool isSpeculative = getSpeculative();
 
   // Note that temporal hints are shared between load, store,
@@ -1347,12 +1347,13 @@ LogicalResult GlobalPrefetchOp::verify() {
   // documentation. In case of global prefetch, non-temporal (NT)
   // and last-use (LU) hints are not used. The extra bits of encoding
   // are used to encode speculative or non-speculative instruction behavior
-  if (llvm::is_contained({TemporalHint::NT, TemporalHint::LU}, temporalHint))
+  if (llvm::is_contained({LoadTemporalHint::NT, LoadTemporalHint::LU},
+                         temporalHint))
     return this->emitOpError("does not support NT and LU modes");
 
-  if (llvm::is_contained(
-          {TemporalHint::NT_RT, TemporalHint::RT_NT, TemporalHint::NT_HT},
-          temporalHint) &&
+  if (llvm::is_contained({LoadTemporalHint::NT_RT, LoadTemporalHint::RT_NT,
+                          LoadTemporalHint::NT_HT},
+                         temporalHint) &&
       !isSpeculative) {
     return this->emitOpError("operates only in the speculative mode");
   }

>From 4a828e22b689304f6efd5a74aee0223d47737452 Mon Sep 17 00:00:00 2001
From: ravil-mobile <ravil.aviva.com at gmail.com>
Date: Thu, 2 Apr 2026 10:04:29 +0000
Subject: [PATCH 3/3] Addressed comments under #189339

---
 .../mlir/Dialect/AMDGPU/IR/AMDGPUOps.td       |  4 ----
 .../mlir/Dialect/AMDGPU/Utils/Chipset.h       | 19 +++++++++++++++
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           |  2 +-
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp      | 23 +++++--------------
 .../AMDGPUToROCDL/global-prefetch.mlir        | 16 ++++++-------
 mlir/test/Dialect/AMDGPU/invalid.mlir         |  9 ++++++++
 6 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index ff5fea4a67ad6..d90659e31429d 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1978,10 +1978,6 @@ def AMDGPU_GlobalPrefetchOp :
     ```
   }];
 
-  let extraClassDeclaration = [{
-    static int32_t getLLVMEncoding(amdgpu::LoadTemporalHint hint, amdgpu::Scope scope, bool isSpeculative);
-  }];
-
   let assemblyFormat = [{
     $src `[` $indices `]` $temporalHint $cacheScope (`speculative` $speculative^)? attr-dict `:` qualified(type($src))
   }];
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h b/mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h
index ca9809799588c..ffd3325f500ee 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h
@@ -8,7 +8,9 @@
 #ifndef MLIR_DIALECT_AMDGPU_UTILS_CHIPSET_H_
 #define MLIR_DIALECT_AMDGPU_UTILS_CHIPSET_H_
 
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 #include "mlir/Support/LLVM.h"
+#include "llvm/ADT/STLExtras.h"
 #include <tuple>
 
 namespace mlir::amdgpu {
@@ -54,6 +56,23 @@ inline bool hasOcpFp8(const Chipset &chipset) {
          chipset.majorVersion >= 12;
 }
 
+inline int32_t getGlobalPrefetchLLVMEncoding(amdgpu::LoadTemporalHint hint,
+                                             amdgpu::Scope scope,
+                                             bool isSpeculative) {
+  int32_t immArg = static_cast<int32_t>(hint);
+
+  // Note that only RT and HT can operate in both speculative and
+  // non-speculative modes. The other variants (NT_RT, RT_NT, NT_HT, etc.)
+  // operate only in the speculative mode and, therefore, do not require
+  // toggling the least significant bit for mode changes.
+  // The temporal hint is encoded in the lower bits, i.e., [2:0].
+  if (llvm::is_contained({LoadTemporalHint::RT, LoadTemporalHint::HT}, hint))
+    immArg = isSpeculative ? immArg : immArg | 1;
+
+  // Prefetch scope level is encoded in upper bits - i.e., [4:3]
+  return static_cast<int32_t>(scope) << 3 | immArg;
+}
+
 } // namespace mlir::amdgpu
 
 #endif
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 0135859404da1..bd32715dba762 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -3962,7 +3962,7 @@ struct GlobalPrefetchOpLowering
       return op->emitOpError("is only supported on gfx1250+");
 
     const bool isSpeculative = op.getSpeculative();
-    const int32_t immArgValue = GlobalPrefetchOp::getLLVMEncoding(
+    const int32_t immArgValue = getGlobalPrefetchLLVMEncoding(
         op.getTemporalHint(), op.getCacheScope(), isSpeculative);
     IntegerAttr immArgAttr = rewriter.getI32IntegerAttr(immArgValue);
 
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index e2df0aa41bebc..5bfa0f47140a3 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -1306,23 +1306,6 @@ LogicalResult DsBarrierArriveOp::verify() {
 // GlobalPrefetchOp
 //===----------------------------------------------------------------------===//
 
-int32_t GlobalPrefetchOp::getLLVMEncoding(amdgpu::LoadTemporalHint hint,
-                                          amdgpu::Scope scope,
-                                          bool isSpeculative) {
-  int32_t immArg = static_cast<int32_t>(hint);
-
-  // Note that only RT and HT can operate in both speculative and
-  // non-speculative modes. The other variants (NT_RT, RT_NT, NT_HT, etc.)
-  // operate only in the speculative mode and, therefore, do not require
-  // toggling the least significant bit for mode changes
-  // Temporal hint is encoded in lower bits - i.e. [2:0]
-  if (llvm::is_contained({LoadTemporalHint::RT, LoadTemporalHint::HT}, hint))
-    immArg = isSpeculative ? immArg : immArg | 1;
-
-  // Prefetch scope level is encoded in upper bits - i.e., [4:3]
-  return static_cast<int32_t>(scope) << 3 | immArg;
-}
-
 LogicalResult GlobalPrefetchOp::verify() {
   auto src = cast<MemRefType>(getSrc().getType());
 
@@ -1339,8 +1322,14 @@ LogicalResult GlobalPrefetchOp::verify() {
         "the number of indices must match the source shape size");
 
   const LoadTemporalHint temporalHint = getTemporalHint();
+  const Scope scope = getCacheScope();
   const bool isSpeculative = getSpeculative();
 
+  // See the GFX1250 SPG for a detailed explanation.
+  if (isSpeculative && scope == Scope::WGP)
+    return this->emitOpError(
+        "does not support speculative prefetch in WGP scope");
+
   // Note that temporal hints are shared between load, store,
   // prefetch, etc. instructions. However, some instructions
   // operate only with a subset of hints according to the ISA
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir b/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
index 643b809d90675..acd3710a485ac 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/global-prefetch.mlir
@@ -2,24 +2,24 @@
 
 // CHECK-LABEL: @glb_prefetch0
 func.func @glb_prefetch0(%src : memref<64x64xf16, #gpu.address_space<global>>, %i : i64, %j : i64) {
-  // CHECK: %[[PTR:.*]] = llvm.getelementptr %{{.*}}[%{{.*}}] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f16
-  // CHECK: rocdl.global.prefetch %{{.*}}, scope 0 : !llvm.ptr<1>
-  amdgpu.global_prefetch %src[%i, %j] RT WGP speculative : memref<64x64xf16, #gpu.address_space<global>>
+  // CHECK: %[[PTR:.*]] = llvm.getelementptr inbounds|nuw %{{.*}}[%{{.*}}] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f16
+  // CHECK: rocdl.global.prefetch %[[PTR]], scope 3 : !llvm.ptr<1>
+  amdgpu.global_prefetch %src[%i, %j] HT WGP : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }
 
 // CHECK-LABEL: @glb_prefetch1
 func.func @glb_prefetch1(%src : memref<64x64xf16, #gpu.address_space<global>>, %i : i64, %j : i64) {
-  // CHECK: %[[PTR:.*]] = llvm.getelementptr inbounds|nuw %{{.*}}[%{{.*}}] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f16
-  // CHECK: rocdl.global.prefetch %[[PTR]], scope 3 : !llvm.ptr<1>
-  amdgpu.global_prefetch %src[%i, %j] HT WGP : memref<64x64xf16, #gpu.address_space<global>>
+  // CHECK: %[[PTR:.*]] = llvm.getelementptr %{{.*}}[%{{.*}}] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f16
+  // CHECK: rocdl.global.prefetch %[[PTR]], scope 10 : !llvm.ptr<1>
+  amdgpu.global_prefetch %src[%i, %j] HT SE speculative : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }
 
 // CHECK-LABEL: @glb_prefetch2
 func.func @glb_prefetch2(%src : memref<64x64xf16, #gpu.address_space<global>>, %i : i64, %j : i64) {
   // CHECK: %[[PTR:.*]] = llvm.getelementptr %{{.*}}[%{{.*}}] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f16
-  // CHECK: rocdl.global.prefetch %[[PTR]], scope 10 : !llvm.ptr<1>
-  amdgpu.global_prefetch %src[%i, %j] HT SE speculative : memref<64x64xf16, #gpu.address_space<global>>
+  // CHECK: rocdl.global.prefetch %{{.*}}, scope 16 : !llvm.ptr<1>
+  amdgpu.global_prefetch %src[%i, %j] RT DEV speculative : memref<64x64xf16, #gpu.address_space<global>>
   func.return
 }
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 1eb64ddecf695..87f61ca43e4fc 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -682,6 +682,15 @@ func.func @global_prefetch_wrong_address_space(%src: memref<64x64xf16, #gpu.addr
 
 // -----
 
+// GlobalPrefetchOp: WGP scope operates only in the non-speculative mode
+func.func @global_prefetch_wrong_num_indices(%src: memref<64x64xf16, #gpu.address_space<global>>, %i: i64, %j: i64) {
+  // expected-error@+1 {{'amdgpu.global_prefetch' op does not support speculative prefetch in WGP scope}}
+  amdgpu.global_prefetch %src[%i, %j] RT WGP speculative : memref<64x64xf16, #gpu.address_space<global>>
+  func.return
+}
+
+// -----
+
 // GlobalPrefetchOp: number of indices must match source shape rank
 func.func @global_prefetch_wrong_num_indices(%src: memref<64x64xf16, #gpu.address_space<global>>, %i: i64) {
  // expected-error@+1 {{'amdgpu.global_prefetch' op the number of indices must match the source shape size}}