[Mlir-commits] [mlir] [MLIR][NVVM] Add prefetch Ops (PR #141737)
Srinivasa Ravi
llvmlistbot at llvm.org
Wed Jun 4 04:18:14 PDT 2025
https://github.com/Wolfram70 updated https://github.com/llvm/llvm-project/pull/141737
>From 532e0d949a2b91399d00a455b5c3e6aee1eb8a41 Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Tue, 27 May 2025 15:35:05 +0530
Subject: [PATCH 1/4] [MLIR][NVVM] Add prefetch Ops
This change adds the `prefetch` and `prefetch.uniform` Ops to the NVVM
dialect, covering the `prefetch.L1`, `prefetch.L2`, and `prefetchu.L1`
variants of the `prefetch` and `prefetchu` group of instructions.
PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu
---
.../include/mlir/Dialect/LLVMIR/NVVMDialect.h | 4 +
mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 89 +++++++++++++++++++
mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 71 +++++++++++++++
mlir/test/Dialect/LLVMIR/nvvm.mlir | 23 +++++
mlir/test/Target/LLVMIR/nvvm/prefetch.mlir | 47 ++++++++++
mlir/test/Target/LLVMIR/nvvmir-invalid.mlir | 56 ++++++++++++
6 files changed, 290 insertions(+)
create mode 100644 mlir/test/Target/LLVMIR/nvvm/prefetch.mlir
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
index fc38a3fb2d387..6137bb087c576 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
@@ -36,12 +36,16 @@ constexpr int kSharedMemoryAlignmentBit = 128;
/// NVVM memory space identifiers.
enum NVVMMemorySpace {
+ /// Generic memory space identifier.
+ kGenericMemorySpace = 0,
/// Global memory space identifier.
kGlobalMemorySpace = 1,
/// Shared memory space identifier.
kSharedMemorySpace = 3,
/// Constant memory space identifier.
kConstantMemorySpace = 4,
+ /// Local memory space identifier.
+ kLocalMemorySpace = 5,
/// Tensor memory space identifier.
/// Tensor memory is available only in arch-accelerated
/// variants from sm100 onwards.
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 596a584d485ed..7a89316490280 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -25,6 +25,7 @@ include "mlir/Dialect/LLVMIR/LLVMTypes.td"
def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>;
def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>;
def LLVM_PointerShared : LLVM_PointerInAddressSpace<3>;
+def LLVM_PointerLocal : LLVM_PointerInAddressSpace<5>;
def LLVM_PointerTensor : LLVM_PointerInAddressSpace<6>;
def LLVM_PointerSharedCluster : LLVM_PointerInAddressSpace<7>;
@@ -118,6 +119,25 @@ class NVVM_Attr<string attrName, string attrMnemonic, list<Trait> traits = []>
let mnemonic = attrMnemonic;
}
+// Cache Eviction Priority enum definitions
+def EvictNormal : I32EnumCase<"EvictNormal", 0, "evict_normal">;
+def EvictFirst : I32EnumCase<"EvictFirst", 1, "evict_first">;
+def EvictLast : I32EnumCase<"EvictLast", 2, "evict_last">;
+def EvictUnchanged : I32EnumCase<"EvictUnchanged", 3, "evict_unchanged">;
+def NoAllocate : I32EnumCase<"NoAllocate", 4, "no_allocate">;
+
+def CacheEvictionPriority : I32Enum<"CacheEvictionPriority",
+ "NVVM Cache Eviction Priority",
+ [EvictNormal, EvictFirst, EvictLast,
+ EvictUnchanged, NoAllocate]> {
+ let cppNamespace = "::mlir::NVVM";
+}
+
+def CacheEvictionPriorityAttr : EnumAttr<NVVM_Dialect, CacheEvictionPriority,
+ "cache_eviction_priority"> {
+ let assemblyFormat = "$value";
+}
+
//===----------------------------------------------------------------------===//
// NVVM intrinsic operations
//===----------------------------------------------------------------------===//
@@ -2333,6 +2353,75 @@ def NVVM_CpAsyncBulkTensorSharedCTAToGlobalOp :
let hasVerifier = 1;
}
+//===----------------------------------------------------------------------===//
+// NVVM Prefetch Ops
+//===----------------------------------------------------------------------===//
+
+def PrefetchCacheLevelL1 : I32EnumCase<"L1", 0, "L1">;
+def PrefetchCacheLevelL2 : I32EnumCase<"L2", 1, "L2">;
+
+def PrefetchCacheLevel : I32Enum<"PrefetchCacheLevel",
+ "NVVM Prefetch Cache Level",
+ [PrefetchCacheLevelL1, PrefetchCacheLevelL2]> {
+ let cppNamespace = "::mlir::NVVM";
+}
+
+def PrefetchCacheLevelAttr : EnumAttr<NVVM_Dialect, PrefetchCacheLevel, "prefetch_cache_level"> {
+ let assemblyFormat = "$value";
+}
+
+def NVVM_PrefetchOp : NVVM_Op<"prefetch"> {
+ let summary = "Brings the cache line containing an address into the specified cache level";
+ let description = [{
+ Operand `addr` can be a global, local or generic address pointer. No
+ operation is performed if `addr` maps to a `shared` memory location.
+
+ The `cacheLevel` attribute specifies the cache level to which the cache line
+ containing the specified address is brought.
+
+ The `evictPriority` attribute is optional and specifies the cache eviction
+ priority when `cacheLevel` is L2.
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu)
+ }];
+ let arguments = (ins PrefetchCacheLevelAttr:$cacheLevel,
+ AnyTypeOf<[LLVM_PointerGlobal,
+ LLVM_PointerLocal,
+ LLVM_PointerGeneric]>:$addr,
+ OptionalAttr<CacheEvictionPriorityAttr>:$evictPriority);
+ let assemblyFormat = "`level` `=` $cacheLevel `,` $addr (`,` `evict_priority` `=` $evictPriority^)? attr-dict `:` type($addr)";
+ let hasVerifier = 1;
+
+ let extraClassDeclaration = [{
+ static llvm::Intrinsic::ID getIntrinsicID(Operation &op);
+ }];
+ let llvmBuilder = [{
+ auto intId = NVVM::PrefetchOp::getIntrinsicID(*op);
+ createIntrinsicCall(builder, intId, $addr);
+ }];
+}
+
+def NVVM_PrefetchUniformOp : NVVM_Op<"prefetch.uniform"> {
+ let summary = "Brings the cache line containing an address into the specified uniform cache level";
+ let description = [{
+ Operand `addr` must be a generic address pointer and no operation is
+ performed if `addr` maps to a `const`, `local`, or `shared` memory location.
+
+ The `cacheLevel` attribute specifies the cache level to which the cache line
+ containing the specified address is brought. The only supported level is L1.
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu)
+ }];
+ let arguments = (ins PrefetchCacheLevelAttr:$cacheLevel,
+ LLVM_PointerGeneric:$addr);
+ let assemblyFormat = "`level` `=` $cacheLevel `,` $addr attr-dict `:` type($addr)";
+ let hasVerifier = 1;
+
+ let llvmBuilder = [{
+ createIntrinsicCall(builder, llvm::Intrinsic::nvvm_prefetchu_L1, $addr);
+ }];
+}
+
def NVVM_PrefetchTensorMapOp : NVVM_Op<"prefetch.tensormap",
[DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]>,
Arguments<(ins LLVM_AnyPointer:$tmaDescriptor, PtxPredicate:$predicate)> {
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index a77ff1e32dc23..6b085892e0b59 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1205,6 +1205,35 @@ LogicalResult NVVM::VoteSyncOp::verify() {
return success();
}
+LogicalResult NVVM::PrefetchOp::verify() {
+ auto evictPriority = getEvictPriority();
+
+ if (evictPriority && getCacheLevel() != NVVM::PrefetchCacheLevel::L2)
+ return emitOpError(
+ "cache eviction priority supported only for cache level L2");
+
+ if (evictPriority &&
+ (llvm::cast<LLVM::LLVMPointerType>(getAddr().getType())
+ .getAddressSpace() != NVVM::NVVMMemorySpace::kGlobalMemorySpace))
+ return emitOpError("cache eviction priority requires a global pointer");
+
+ if (evictPriority &&
+ *evictPriority != NVVM::CacheEvictionPriority::EvictNormal &&
+ *evictPriority != NVVM::CacheEvictionPriority::EvictLast)
+ return emitOpError(
+ "unsupported cache eviction priority, only evict_last and "
+ "evict_normal are supported");
+
+ return success();
+}
+
+LogicalResult NVVM::PrefetchUniformOp::verify() {
+ if (getCacheLevel() != NVVM::PrefetchCacheLevel::L1)
+ return emitOpError(
+ "unsupported cache level, the only supported level is L1");
+ return success();
+}
+
/// Packs the given `field` into the `result`.
/// The `result` is 64-bits and each `field` can be 32-bits or narrower.
static llvm::Value *
@@ -1734,6 +1763,48 @@ NVVM::IDArgPair DotAccumulate2WayOp::getIntrinsicIDAndArgs(
return {ids[type], args};
}
+llvm::Intrinsic::ID PrefetchOp::getIntrinsicID(Operation &op) {
+ auto curOp = llvm::cast<NVVM::PrefetchOp>(op);
+ NVVM::PrefetchCacheLevel cacheLevel = curOp.getCacheLevel();
+ std::optional<NVVM::CacheEvictionPriority> evictPriority =
+ curOp.getEvictPriority();
+ unsigned as = llvm::cast<LLVM::LLVMPointerType>(curOp.getAddr().getType())
+ .getAddressSpace();
+
+ if (cacheLevel == NVVM::PrefetchCacheLevel::L1) {
+ switch (as) {
+ case NVVM::NVVMMemorySpace::kGenericMemorySpace:
+ return llvm::Intrinsic::nvvm_prefetch_L1;
+ case NVVM::NVVMMemorySpace::kGlobalMemorySpace:
+ return llvm::Intrinsic::nvvm_prefetch_global_L1;
+ case NVVM::NVVMMemorySpace::kLocalMemorySpace:
+ return llvm::Intrinsic::nvvm_prefetch_local_L1;
+ default:
+ llvm_unreachable("Invalid pointer address space");
+ }
+ } else if (cacheLevel == NVVM::PrefetchCacheLevel::L2) {
+ switch (as) {
+ case NVVM::NVVMMemorySpace::kGenericMemorySpace:
+ return llvm::Intrinsic::nvvm_prefetch_L2;
+ case NVVM::NVVMMemorySpace::kGlobalMemorySpace:
+ if (evictPriority) {
+ if (*evictPriority == NVVM::CacheEvictionPriority::EvictLast)
+ return llvm::Intrinsic::nvvm_prefetch_global_L2_evict_last;
+ else if (*evictPriority == NVVM::CacheEvictionPriority::EvictNormal)
+ return llvm::Intrinsic::nvvm_prefetch_global_L2_evict_normal;
+ else
+ llvm_unreachable("Invalid cache eviction priority");
+ }
+ return llvm::Intrinsic::nvvm_prefetch_global_L2;
+ case NVVM::NVVMMemorySpace::kLocalMemorySpace:
+ return llvm::Intrinsic::nvvm_prefetch_local_L2;
+ default:
+ llvm_unreachable("Invalid pointer address space");
+ }
+ }
+ llvm_unreachable("Invalid cache level");
+}
+
//===----------------------------------------------------------------------===//
// NVVMDialect initialization, type parsing, and registration.
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index a02d33f50e0d2..8184ec05ffc58 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -596,6 +596,29 @@ func.func @dot_accumulate_2way(%a_vec: vector<2xi16>, %b_vec: vector<4xi8>, %c:
return
}
+// CHECK-LABEL: @prefetch
+func.func @prefetch(%gen_ptr: !llvm.ptr, %local_ptr: !llvm.ptr<5>, %global_ptr: !llvm.ptr<1>) {
+ // CHECK: nvvm.prefetch level = L1, %{{.*}}
+ nvvm.prefetch level = L1, %gen_ptr : !llvm.ptr<0>
+ // CHECK: nvvm.prefetch level = L1, %{{.*}}
+ nvvm.prefetch level = L1, %local_ptr : !llvm.ptr<5>
+ // CHECK: nvvm.prefetch level = L1, %{{.*}}
+ nvvm.prefetch level = L1, %global_ptr : !llvm.ptr<1>
+ // CHECK: nvvm.prefetch level = L2, %{{.*}}
+ nvvm.prefetch level = L2, %gen_ptr : !llvm.ptr<0>
+ // CHECK: nvvm.prefetch level = L2, %{{.*}}
+ nvvm.prefetch level = L2, %local_ptr : !llvm.ptr<5>
+ // CHECK: nvvm.prefetch level = L2, %{{.*}}
+ nvvm.prefetch level = L2, %global_ptr : !llvm.ptr<1>
+ // CHECK: nvvm.prefetch level = L2, %{{.*}}
+ nvvm.prefetch level = L2, %global_ptr, evict_priority = evict_last : !llvm.ptr<1>
+ // CHECK: nvvm.prefetch level = L2, %{{.*}}
+ nvvm.prefetch level = L2, %global_ptr, evict_priority = evict_normal : !llvm.ptr<1>
+ // CHECK: nvvm.prefetch.uniform level = L1, %{{.*}}
+ nvvm.prefetch.uniform level = L1, %gen_ptr : !llvm.ptr
+ return
+}
+
// -----
// Just check these don't emit errors.
diff --git a/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir b/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir
new file mode 100644
index 0000000000000..b362d26f82e25
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir
@@ -0,0 +1,47 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+llvm.func @prefetch_L1(%gen_ptr: !llvm.ptr, %local_ptr: !llvm.ptr<5>, %global_ptr: !llvm.ptr<1>) {
+ // CHECK-LABEL: define void @prefetch_L1(ptr %0, ptr addrspace(5) %1, ptr addrspace(1) %2) {
+ // CHECK-NEXT: call void @llvm.nvvm.prefetch.L1(ptr %0)
+ // CHECK-NEXT: call void @llvm.nvvm.prefetch.local.L1(ptr addrspace(5) %1)
+ // CHECK-NEXT: call void @llvm.nvvm.prefetch.global.L1(ptr addrspace(1) %2)
+ // CHECK-NEXT: ret void
+ // CHECK-NEXT: }
+ nvvm.prefetch level = L1, %gen_ptr : !llvm.ptr<0>
+ nvvm.prefetch level = L1, %local_ptr : !llvm.ptr<5>
+ nvvm.prefetch level = L1, %global_ptr : !llvm.ptr<1>
+ llvm.return
+}
+
+llvm.func @prefetch_L2(%gen_ptr: !llvm.ptr, %local_ptr: !llvm.ptr<5>, %global_ptr: !llvm.ptr<1>) {
+ // CHECK-LABEL: define void @prefetch_L2(ptr %0, ptr addrspace(5) %1, ptr addrspace(1) %2) {
+ // CHECK-NEXT: call void @llvm.nvvm.prefetch.L2(ptr %0)
+ // CHECK-NEXT: call void @llvm.nvvm.prefetch.local.L2(ptr addrspace(5) %1)
+ // CHECK-NEXT: call void @llvm.nvvm.prefetch.global.L2(ptr addrspace(1) %2)
+ // CHECK-NEXT: ret void
+ // CHECK-NEXT: }
+ nvvm.prefetch level = L2, %gen_ptr : !llvm.ptr<0>
+ nvvm.prefetch level = L2, %local_ptr : !llvm.ptr<5>
+ nvvm.prefetch level = L2, %global_ptr : !llvm.ptr<1>
+ llvm.return
+}
+
+llvm.func @prefetch_L2_eviction_priority(%global_ptr: !llvm.ptr<1>) {
+ // CHECK-LABEL: define void @prefetch_L2_eviction_priority(ptr addrspace(1) %0) {
+ // CHECK-NEXT: call void @llvm.nvvm.prefetch.global.L2.evict.last(ptr addrspace(1) %0)
+ // CHECK-NEXT: call void @llvm.nvvm.prefetch.global.L2.evict.normal(ptr addrspace(1) %0)
+ // CHECK-NEXT: ret void
+ // CHECK-NEXT: }
+ nvvm.prefetch level = L2, %global_ptr, evict_priority = evict_last : !llvm.ptr<1>
+ nvvm.prefetch level = L2, %global_ptr, evict_priority = evict_normal : !llvm.ptr<1>
+ llvm.return
+}
+
+llvm.func @prefetch_L1_uniform(%gen_ptr: !llvm.ptr) {
+ // CHECK-LABEL: define void @prefetch_L1_uniform(ptr %0) {
+ // CHECK-NEXT: call void @llvm.nvvm.prefetchu.L1(ptr %0)
+ // CHECK-NEXT: ret void
+ // CHECK-NEXT: }
+ nvvm.prefetch.uniform level = L1, %gen_ptr : !llvm.ptr
+ llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
index 3d63434f310bd..6b8ba3a31bb1c 100644
--- a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
@@ -248,3 +248,59 @@ llvm.func @nvvm_cvt_bf16x2_to_f8x2_invalid_rounding(%src : vector<2xbf16>) {
%res = nvvm.convert.bf16x2.to.f8x2 <ue8m0> %src {rnd = #nvvm.fp_rnd_mode<rn>} : vector<2xbf16> -> i16
llvm.return
}
+
+// -----
+
+llvm.func @nvvm_prefetch_L1_with_evict_priority(%global_ptr: !llvm.ptr<1>) {
+ // expected-error @below {{cache eviction priority supported only for cache level L2}}
+ nvvm.prefetch level = L1, %global_ptr, evict_priority = evict_last : !llvm.ptr<1>
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_prefetch_L2_with_evict_last_invalid_addr_space(%local_ptr: !llvm.ptr<5>) {
+ // expected-error @below {{cache eviction priority requires a global pointer}}
+ nvvm.prefetch level = L2, %local_ptr, evict_priority = evict_last : !llvm.ptr<5>
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_prefetch_L2_with_evict_normal_invalid_addr_space(%local_ptr: !llvm.ptr<5>) {
+ // expected-error @below {{cache eviction priority requires a global pointer}}
+ nvvm.prefetch level = L2, %local_ptr, evict_priority = evict_normal : !llvm.ptr<5>
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_prefetch_L2_with_invalid_evict_first(%global_ptr: !llvm.ptr<1>) {
+ // expected-error @below {{unsupported cache eviction priority, only evict_last and evict_normal are supported}}
+ nvvm.prefetch level = L2, %global_ptr, evict_priority = evict_first : !llvm.ptr<1>
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_prefetch_L2_with_invalid_evict_unchanged(%global_ptr: !llvm.ptr<1>) {
+ // expected-error @below {{unsupported cache eviction priority, only evict_last and evict_normal are supported}}
+ nvvm.prefetch level = L2, %global_ptr, evict_priority = evict_unchanged : !llvm.ptr<1>
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_prefetch_L2_with_invalid_no_allocate(%global_ptr: !llvm.ptr<1>) {
+ // expected-error @below {{unsupported cache eviction priority, only evict_last and evict_normal are supported}}
+ nvvm.prefetch level = L2, %global_ptr, evict_priority = no_allocate : !llvm.ptr<1>
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_prefetch_uniform_with_L2(%gen_ptr: !llvm.ptr) {
+ // expected-error @below {{unsupported cache level, the only supported level is L1}}
+ nvvm.prefetch.uniform level = L2, %gen_ptr : !llvm.ptr
+ llvm.return
+}
>From 2c37270d66b1376c6979e1a98cad9df4e9ad02d3 Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Tue, 3 Jun 2025 23:02:18 +0530
Subject: [PATCH 2/4] merge prefetch.uniform into prefetch
---
mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 27 ++++---------------
mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 30 ++++++++++++++-------
mlir/test/Dialect/LLVMIR/nvvm.mlir | 4 +--
mlir/test/Target/LLVMIR/nvvm/prefetch.mlir | 2 +-
mlir/test/Target/LLVMIR/nvvmir-invalid.mlir | 12 +++++++--
5 files changed, 39 insertions(+), 36 deletions(-)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 7a89316490280..151174a9d2add 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -2378,6 +2378,9 @@ def NVVM_PrefetchOp : NVVM_Op<"prefetch"> {
The `cacheLevel` attribute specifies the cache level to which the cache line
containing the specified address is brought.
+
+ `uniform` can be specified after the `cacheLevel` to indicate that the
+ prefetch is performed to the specified uniform cache level. If `uniform` is specified, `addr` must be a generic address pointer and no operation is performed if `addr` maps to a `const`, `local`, or `shared` memory location.
The `evictPriority` attribute is optional and specifies the cache eviction
priority when `cacheLevel` is L2.
@@ -2385,11 +2388,12 @@ def NVVM_PrefetchOp : NVVM_Op<"prefetch"> {
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu)
}];
let arguments = (ins PrefetchCacheLevelAttr:$cacheLevel,
+ UnitAttr:$uniform,
AnyTypeOf<[LLVM_PointerGlobal,
LLVM_PointerLocal,
LLVM_PointerGeneric]>:$addr,
OptionalAttr<CacheEvictionPriorityAttr>:$evictPriority);
- let assemblyFormat = "`level` `=` $cacheLevel `,` $addr (`,` `evict_priority` `=` $evictPriority^)? attr-dict `:` type($addr)";
+ let assemblyFormat = "`level` `=` $cacheLevel (`uniform` $uniform^)? `,` $addr (`,` `evict_priority` `=` $evictPriority^)? attr-dict `:` type($addr)";
let hasVerifier = 1;
let extraClassDeclaration = [{
@@ -2401,27 +2405,6 @@ def NVVM_PrefetchOp : NVVM_Op<"prefetch"> {
}];
}
-def NVVM_PrefetchUniformOp : NVVM_Op<"prefetch.uniform"> {
- let summary = "Brings the cache line containing an address into the specified uniform cache level";
- let description = [{
- Operand `addr` must be a generic address pointer and no operation is
- performed if `addr` maps to a `const`, `local`, or `shared` memory location.
-
- The `cacheLevel` attribute specifies the cache level to which the cache line
- containing the specified address is brought. The only supported level is L1.
-
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu)
- }];
- let arguments = (ins PrefetchCacheLevelAttr:$cacheLevel,
- LLVM_PointerGeneric:$addr);
- let assemblyFormat = "`level` `=` $cacheLevel `,` $addr attr-dict `:` type($addr)";
- let hasVerifier = 1;
-
- let llvmBuilder = [{
- createIntrinsicCall(builder, llvm::Intrinsic::nvvm_prefetchu_L1, $addr);
- }];
-}
-
def NVVM_PrefetchTensorMapOp : NVVM_Op<"prefetch.tensormap",
[DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]>,
Arguments<(ins LLVM_AnyPointer:$tmaDescriptor, PtxPredicate:$predicate)> {
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 6b085892e0b59..31056f34f6e70 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1206,15 +1206,27 @@ LogicalResult NVVM::VoteSyncOp::verify() {
}
LogicalResult NVVM::PrefetchOp::verify() {
+ unsigned addressSpace =
+ llvm::cast<LLVM::LLVMPointerType>(getAddr().getType()).getAddressSpace();
auto evictPriority = getEvictPriority();
+ if (getUniform()) {
+ if (!(getCacheLevel() == NVVM::PrefetchCacheLevel::L1)) {
+ return emitOpError("unsupported cache level, the only supported uniform "
+ "cache level is L1");
+ }
+ if (addressSpace != NVVM::NVVMMemorySpace::kGenericMemorySpace) {
+ return emitOpError(
+ "prefetch to uniform cache requires a generic pointer");
+ }
+ }
+
if (evictPriority && getCacheLevel() != NVVM::PrefetchCacheLevel::L2)
return emitOpError(
"cache eviction priority supported only for cache level L2");
if (evictPriority &&
- (llvm::cast<LLVM::LLVMPointerType>(getAddr().getType())
- .getAddressSpace() != NVVM::NVVMMemorySpace::kGlobalMemorySpace))
+ (addressSpace != NVVM::NVVMMemorySpace::kGlobalMemorySpace))
return emitOpError("cache eviction priority requires a global pointer");
if (evictPriority &&
@@ -1227,13 +1239,6 @@ LogicalResult NVVM::PrefetchOp::verify() {
return success();
}
-LogicalResult NVVM::PrefetchUniformOp::verify() {
- if (getCacheLevel() != NVVM::PrefetchCacheLevel::L1)
- return emitOpError(
- "unsupported cache level, the only supported level is L1");
- return success();
-}
-
/// Packs the given `field` into the `result`.
/// The `result` is 64-bits and each `field` can be 32-bits or narrower.
static llvm::Value *
@@ -1771,6 +1776,13 @@ llvm::Intrinsic::ID PrefetchOp::getIntrinsicID(Operation &op) {
unsigned as = llvm::cast<LLVM::LLVMPointerType>(curOp.getAddr().getType())
.getAddressSpace();
+ if (curOp.getUniform()) {
+ if (cacheLevel == NVVM::PrefetchCacheLevel::L1)
+ return llvm::Intrinsic::nvvm_prefetchu_L1;
+ else
+ llvm_unreachable("Invalid uniform cache level");
+ }
+
if (cacheLevel == NVVM::PrefetchCacheLevel::L1) {
switch (as) {
case NVVM::NVVMMemorySpace::kGenericMemorySpace:
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index 8184ec05ffc58..c7fa41c98ac92 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -614,8 +614,8 @@ func.func @prefetch(%gen_ptr: !llvm.ptr, %local_ptr: !llvm.ptr<5>, %global_ptr:
nvvm.prefetch level = L2, %global_ptr, evict_priority = evict_last : !llvm.ptr<1>
// CHECK: nvvm.prefetch level = L2, %{{.*}}
nvvm.prefetch level = L2, %global_ptr, evict_priority = evict_normal : !llvm.ptr<1>
- // CHECK: nvvm.prefetch.uniform level = L1, %{{.*}}
- nvvm.prefetch.uniform level = L1, %gen_ptr : !llvm.ptr
+ // CHECK: nvvm.prefetch level = L1 uniform, %{{.*}}
+ nvvm.prefetch level = L1 uniform, %gen_ptr : !llvm.ptr
return
}
diff --git a/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir b/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir
index b362d26f82e25..f38b7529a7233 100644
--- a/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/prefetch.mlir
@@ -42,6 +42,6 @@ llvm.func @prefetch_L1_uniform(%gen_ptr: !llvm.ptr) {
// CHECK-NEXT: call void @llvm.nvvm.prefetchu.L1(ptr %0)
// CHECK-NEXT: ret void
// CHECK-NEXT: }
- nvvm.prefetch.uniform level = L1, %gen_ptr : !llvm.ptr
+ nvvm.prefetch level = L1 uniform, %gen_ptr : !llvm.ptr
llvm.return
}
diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
index 6b8ba3a31bb1c..8c4f0aafd36a7 100644
--- a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
@@ -300,7 +300,15 @@ llvm.func @nvvm_prefetch_L2_with_invalid_no_allocate(%global_ptr: !llvm.ptr<1>)
// -----
llvm.func @nvvm_prefetch_uniform_with_L2(%gen_ptr: !llvm.ptr) {
- // expected-error @below {{unsupported cache level, the only supported level is L1}}
- nvvm.prefetch.uniform level = L2, %gen_ptr : !llvm.ptr
+ // expected-error @below {{unsupported cache level, the only supported uniform cache level is L1}}
+ nvvm.prefetch level = L2 uniform, %gen_ptr : !llvm.ptr
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_prefetch_uniform_with_invalid_addr_space(%global_ptr: !llvm.ptr<1>) {
+ // expected-error @below {{prefetch to uniform cache requires a generic pointer}}
+ nvvm.prefetch level = L1 uniform, %global_ptr : !llvm.ptr<1>
llvm.return
}
>From 4dba233f53ef76fed16201ef222c4dc0c00155cc Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Wed, 4 Jun 2025 16:10:03 +0530
Subject: [PATCH 3/4] address comments and cleanup
---
mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 4 +-
mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 103 ++++++++++----------
2 files changed, 52 insertions(+), 55 deletions(-)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 151174a9d2add..0555d4f79cb1f 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -2380,7 +2380,9 @@ def NVVM_PrefetchOp : NVVM_Op<"prefetch"> {
containing the specified address is brought.
`uniform` can be specified after the `cacheLevel` to indicate that the
- prefetch is performed to the specified uniform cache level. If `uniform` is specified, `addr` must be a generic address pointer and no operation is performed if `addr` maps to a `const`, `local`, or `shared` memory location.
+ prefetch is performed to the specified uniform cache level. If `uniform` is
+ specified, `addr` must be a generic address pointer and no operation is
+ performed if `addr` maps to a `const`, `local`, or `shared` memory location.
The `evictPriority` attribute is optional and specifies the cache eviction
priority when `cacheLevel` is L2.
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 31056f34f6e70..59de2e24cfb6e 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1206,35 +1206,37 @@ LogicalResult NVVM::VoteSyncOp::verify() {
}
LogicalResult NVVM::PrefetchOp::verify() {
- unsigned addressSpace =
+ using MemSpace = NVVM::NVVMMemorySpace;
+ using CacheLevel = NVVM::PrefetchCacheLevel;
+
+ unsigned as =
llvm::cast<LLVM::LLVMPointerType>(getAddr().getType()).getAddressSpace();
- auto evictPriority = getEvictPriority();
+ std::optional<NVVM::CacheEvictionPriority> evictPriority = getEvictPriority();
if (getUniform()) {
- if (!(getCacheLevel() == NVVM::PrefetchCacheLevel::L1)) {
+ if (getCacheLevel() != CacheLevel::L1)
return emitOpError("unsupported cache level, the only supported uniform "
"cache level is L1");
- }
- if (addressSpace != NVVM::NVVMMemorySpace::kGenericMemorySpace) {
+
+ if (as != MemSpace::kGenericMemorySpace)
return emitOpError(
"prefetch to uniform cache requires a generic pointer");
- }
}
- if (evictPriority && getCacheLevel() != NVVM::PrefetchCacheLevel::L2)
- return emitOpError(
- "cache eviction priority supported only for cache level L2");
+ if (evictPriority) {
+ if (getCacheLevel() != CacheLevel::L2)
+ return emitOpError(
+ "cache eviction priority supported only for cache level L2");
- if (evictPriority &&
- (addressSpace != NVVM::NVVMMemorySpace::kGlobalMemorySpace))
- return emitOpError("cache eviction priority requires a global pointer");
+ if (as != MemSpace::kGlobalMemorySpace)
+ return emitOpError("cache eviction priority requires a global pointer");
- if (evictPriority &&
- *evictPriority != NVVM::CacheEvictionPriority::EvictNormal &&
- *evictPriority != NVVM::CacheEvictionPriority::EvictLast)
- return emitOpError(
- "unsupported cache eviction priority, only evict_last and "
- "evict_normal are supported");
+ if (*evictPriority != NVVM::CacheEvictionPriority::EvictNormal &&
+ *evictPriority != NVVM::CacheEvictionPriority::EvictLast)
+ return emitOpError(
+ "unsupported cache eviction priority, only evict_last and "
+ "evict_normal are supported");
+ }
return success();
}
@@ -1769,52 +1771,45 @@ NVVM::IDArgPair DotAccumulate2WayOp::getIntrinsicIDAndArgs(
}
llvm::Intrinsic::ID PrefetchOp::getIntrinsicID(Operation &op) {
+ using MemSpace = NVVM::NVVMMemorySpace;
+ using CacheLevel = NVVM::PrefetchCacheLevel;
+
auto curOp = llvm::cast<NVVM::PrefetchOp>(op);
- NVVM::PrefetchCacheLevel cacheLevel = curOp.getCacheLevel();
+ NVVM::PrefetchCacheLevel cl = curOp.getCacheLevel();
std::optional<NVVM::CacheEvictionPriority> evictPriority =
curOp.getEvictPriority();
unsigned as = llvm::cast<LLVM::LLVMPointerType>(curOp.getAddr().getType())
.getAddressSpace();
- if (curOp.getUniform()) {
- if (cacheLevel == NVVM::PrefetchCacheLevel::L1)
- return llvm::Intrinsic::nvvm_prefetchu_L1;
- else
- llvm_unreachable("Invalid uniform cache level");
- }
+ if (curOp.getUniform() && cl == CacheLevel::L1)
+ return llvm::Intrinsic::nvvm_prefetchu_L1;
- if (cacheLevel == NVVM::PrefetchCacheLevel::L1) {
- switch (as) {
- case NVVM::NVVMMemorySpace::kGenericMemorySpace:
- return llvm::Intrinsic::nvvm_prefetch_L1;
- case NVVM::NVVMMemorySpace::kGlobalMemorySpace:
- return llvm::Intrinsic::nvvm_prefetch_global_L1;
- case NVVM::NVVMMemorySpace::kLocalMemorySpace:
- return llvm::Intrinsic::nvvm_prefetch_local_L1;
- default:
- llvm_unreachable("Invalid pointer address space");
- }
- } else if (cacheLevel == NVVM::PrefetchCacheLevel::L2) {
- switch (as) {
- case NVVM::NVVMMemorySpace::kGenericMemorySpace:
- return llvm::Intrinsic::nvvm_prefetch_L2;
- case NVVM::NVVMMemorySpace::kGlobalMemorySpace:
- if (evictPriority) {
- if (*evictPriority == NVVM::CacheEvictionPriority::EvictLast)
- return llvm::Intrinsic::nvvm_prefetch_global_L2_evict_last;
- else if (*evictPriority == NVVM::CacheEvictionPriority::EvictNormal)
- return llvm::Intrinsic::nvvm_prefetch_global_L2_evict_normal;
- else
- llvm_unreachable("Invalid cache eviction priority");
- }
- return llvm::Intrinsic::nvvm_prefetch_global_L2;
- case NVVM::NVVMMemorySpace::kLocalMemorySpace:
- return llvm::Intrinsic::nvvm_prefetch_local_L2;
+ if (evictPriority && cl == CacheLevel::L2) {
+ switch (*evictPriority) {
+ case NVVM::CacheEvictionPriority::EvictLast:
+ return llvm::Intrinsic::nvvm_prefetch_global_L2_evict_last;
+ case NVVM::CacheEvictionPriority::EvictNormal:
+ return llvm::Intrinsic::nvvm_prefetch_global_L2_evict_normal;
default:
- llvm_unreachable("Invalid pointer address space");
+ llvm_unreachable("Invalid cache eviction priority");
}
}
- llvm_unreachable("Invalid cache level");
+
+ switch (as) {
+ case MemSpace::kGenericMemorySpace:
+ return cl == CacheLevel::L1 ? llvm::Intrinsic::nvvm_prefetch_L1
+ : llvm::Intrinsic::nvvm_prefetch_L2;
+ case MemSpace::kGlobalMemorySpace:
+ return cl == CacheLevel::L1 ? llvm::Intrinsic::nvvm_prefetch_global_L1
+ : llvm::Intrinsic::nvvm_prefetch_global_L2;
+ case MemSpace::kLocalMemorySpace:
+ return cl == CacheLevel::L1 ? llvm::Intrinsic::nvvm_prefetch_local_L1
+ : llvm::Intrinsic::nvvm_prefetch_local_L2;
+ default:
+ llvm_unreachable("Invalid pointer address space");
+ }
+
+ llvm_unreachable("Invalid parameters for prefetch");
}
//===----------------------------------------------------------------------===//
>From 2e3858d5597dac81e038c2acb060db80bf823773 Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Wed, 4 Jun 2025 16:47:51 +0530
Subject: [PATCH 4/4] address comments
---
mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 6 ++--
mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 38 +++++++++++----------
2 files changed, 23 insertions(+), 21 deletions(-)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 0555d4f79cb1f..026c1fae0eb89 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -2354,7 +2354,7 @@ def NVVM_CpAsyncBulkTensorSharedCTAToGlobalOp :
}
//===----------------------------------------------------------------------===//
-// NVVM Prefetch Ops
+// NVVM Prefetch Op
//===----------------------------------------------------------------------===//
def PrefetchCacheLevelL1 : I32EnumCase<"L1", 0, "L1">;
@@ -2399,10 +2399,10 @@ def NVVM_PrefetchOp : NVVM_Op<"prefetch"> {
let hasVerifier = 1;
let extraClassDeclaration = [{
- static llvm::Intrinsic::ID getIntrinsicID(Operation &op);
+ static llvm::Intrinsic::ID getIntrinsicID(NVVM::PrefetchOp &op);
}];
let llvmBuilder = [{
- auto intId = NVVM::PrefetchOp::getIntrinsicID(*op);
+ auto intId = NVVM::PrefetchOp::getIntrinsicID(op);
createIntrinsicCall(builder, intId, $addr);
}];
}
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 59de2e24cfb6e..58bc0031cfbec 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1209,7 +1209,7 @@ LogicalResult NVVM::PrefetchOp::verify() {
using MemSpace = NVVM::NVVMMemorySpace;
using CacheLevel = NVVM::PrefetchCacheLevel;
- unsigned as =
+ unsigned addressSpace =
llvm::cast<LLVM::LLVMPointerType>(getAddr().getType()).getAddressSpace();
std::optional<NVVM::CacheEvictionPriority> evictPriority = getEvictPriority();
@@ -1218,7 +1218,7 @@ LogicalResult NVVM::PrefetchOp::verify() {
return emitOpError("unsupported cache level, the only supported uniform "
"cache level is L1");
- if (as != MemSpace::kGenericMemorySpace)
+ if (addressSpace != MemSpace::kGenericMemorySpace)
return emitOpError(
"prefetch to uniform cache requires a generic pointer");
}
@@ -1228,7 +1228,7 @@ LogicalResult NVVM::PrefetchOp::verify() {
return emitOpError(
"cache eviction priority supported only for cache level L2");
- if (as != MemSpace::kGlobalMemorySpace)
+ if (addressSpace != MemSpace::kGlobalMemorySpace)
return emitOpError("cache eviction priority requires a global pointer");
if (*evictPriority != NVVM::CacheEvictionPriority::EvictNormal &&
@@ -1770,21 +1770,21 @@ NVVM::IDArgPair DotAccumulate2WayOp::getIntrinsicIDAndArgs(
return {ids[type], args};
}
-llvm::Intrinsic::ID PrefetchOp::getIntrinsicID(Operation &op) {
+llvm::Intrinsic::ID PrefetchOp::getIntrinsicID(NVVM::PrefetchOp &op) {
using MemSpace = NVVM::NVVMMemorySpace;
using CacheLevel = NVVM::PrefetchCacheLevel;
- auto curOp = llvm::cast<NVVM::PrefetchOp>(op);
- NVVM::PrefetchCacheLevel cl = curOp.getCacheLevel();
+ NVVM::PrefetchCacheLevel cacheLevel = op.getCacheLevel();
std::optional<NVVM::CacheEvictionPriority> evictPriority =
- curOp.getEvictPriority();
- unsigned as = llvm::cast<LLVM::LLVMPointerType>(curOp.getAddr().getType())
- .getAddressSpace();
+ op.getEvictPriority();
+ unsigned addressSpace =
+ llvm::cast<LLVM::LLVMPointerType>(op.getAddr().getType())
+ .getAddressSpace();
- if (curOp.getUniform() && cl == CacheLevel::L1)
+ if (op.getUniform() && cacheLevel == CacheLevel::L1)
return llvm::Intrinsic::nvvm_prefetchu_L1;
- if (evictPriority && cl == CacheLevel::L2) {
+ if (evictPriority && cacheLevel == CacheLevel::L2) {
switch (*evictPriority) {
case NVVM::CacheEvictionPriority::EvictLast:
return llvm::Intrinsic::nvvm_prefetch_global_L2_evict_last;
@@ -1795,16 +1795,18 @@ llvm::Intrinsic::ID PrefetchOp::getIntrinsicID(Operation &op) {
}
}
- switch (as) {
+ switch (addressSpace) {
case MemSpace::kGenericMemorySpace:
- return cl == CacheLevel::L1 ? llvm::Intrinsic::nvvm_prefetch_L1
- : llvm::Intrinsic::nvvm_prefetch_L2;
+ return cacheLevel == CacheLevel::L1 ? llvm::Intrinsic::nvvm_prefetch_L1
+ : llvm::Intrinsic::nvvm_prefetch_L2;
case MemSpace::kGlobalMemorySpace:
- return cl == CacheLevel::L1 ? llvm::Intrinsic::nvvm_prefetch_global_L1
- : llvm::Intrinsic::nvvm_prefetch_global_L2;
+ return cacheLevel == CacheLevel::L1
+ ? llvm::Intrinsic::nvvm_prefetch_global_L1
+ : llvm::Intrinsic::nvvm_prefetch_global_L2;
case MemSpace::kLocalMemorySpace:
- return cl == CacheLevel::L1 ? llvm::Intrinsic::nvvm_prefetch_local_L1
- : llvm::Intrinsic::nvvm_prefetch_local_L2;
+ return cacheLevel == CacheLevel::L1
+ ? llvm::Intrinsic::nvvm_prefetch_local_L1
+ : llvm::Intrinsic::nvvm_prefetch_local_L2;
default:
llvm_unreachable("Invalid pointer address space");
}
More information about the Mlir-commits mailing list