[Mlir-commits] [mlir] [mlir][NVVM] Add support for few more fence Ops (PR #170251)

Mon Dec 1 23:09:39 PST 2025

https://github.com/schwarzschild-radius created https://github.com/llvm/llvm-project/pull/170251

This commit adds support for the following fence Ops:

- fence.acquire.sync_restrict
- fence.release.sync_restrict
- fence.proxy.acquire.sync_restrict
- fence.proxy.release.sync_restrict

and removes fence.sc.cluster. The commit also moves memory.barrier into the Membar/Fence section

>From 0c128f8aa9f970ef41c06b31993d207c5f968cca Mon Sep 17 00:00:00 2001
From: Pradeep Kumar <pradeepku at nvidia.com>
Date: Mon, 1 Dec 2025 07:14:22 +0000
Subject: [PATCH] [mlir][NVVM] Add support for few more fence Ops

This commit adds support for the following fence Ops:

- fence.acquire.sync_restrict
- fence.release.sync_restrict
- fence.proxy.acquire.sync_restrict
- fence.proxy.release.sync_restrict

and removes fence.sc.cluster. The commit also moves memory.barrier into the Membar/Fence section
---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td   | 160 +++++++++++++-----
 mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp    |  20 ++-
 .../Dialect/NVVM/NVVMToLLVMIRTranslation.cpp  |  19 +++
 .../Conversion/NVVMToLLVM/nvvm-to-llvm.mlir   |   7 -
 mlir/test/Dialect/LLVMIR/nvvm.mlir            |   7 -
 mlir/test/Target/LLVMIR/nvvm/fence.mlir       |  78 +++++++++
 mlir/test/Target/LLVMIR/nvvmir.mlir           |  36 ----
 7 files changed, 230 insertions(+), 97 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/nvvm/fence.mlir

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index b3395b7e0a24e..95bf5709030e2 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -1319,11 +1319,70 @@ def NVVM_ClusterWaitOp : NVVM_Op<"cluster.wait", [NVVMRequiresSM<90>]> {
   let assemblyFormat = "attr-dict";
 }
 
-def NVVM_FenceScClusterOp : NVVM_Op<"fence.sc.cluster"> {
-  string llvmBuilder = [{
-      createIntrinsicCall(builder, llvm::Intrinsic::nvvm_fence_sc_cluster);
+//===----------------------------------------------------------------------===//
+// NVVM Member/Fence
+//===----------------------------------------------------------------------===//
+
+def NVVM_MembarOp : NVVM_Op<"memory.barrier">,
+                    Arguments<(ins MemScopeKindAttr:$scope)> {
+  let summary = "Memory barrier operation";
+  let description = [{
+    `membar` operation guarantees that prior memory accesses requested by this
+    thread are performed at the specified `scope`, before later memory
+    operations requested by this thread following the membar instruction.
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar)
+  }];
+
+  let assemblyFormat = "$scope attr-dict";
+  let llvmBuilder = [{
+    createIntrinsicCall(builder, getMembarIntrinsicID($scope));
+  }];
+}
+
+def NVVM_FenceAcquireSyncRestrictOp : NVVM_Op<"fence.acquire.sync_restrict"> {
+  let summary = "Uni-directional thread fence operation with acquire semantics";
+  let description = [{
+    The `nvvm.fence.acquire.sync_restrict` Op restricts the class of memory
+    operations for which the fence instruction provides the memory ordering guarantees.
+    `sync_restrict` restricts `acquire` memory semantics to `shared_cluster` with cluster scope.
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
   }];
+
   let assemblyFormat = "attr-dict";
+  let llvmBuilder = [{
+    createIntrinsicCall(builder,
+      llvm::Intrinsic::nvvm_fence_acquire_sync_restrict_space_cluster_scope_cluster);
+  }];
+}
+
+def NVVM_FenceReleaseSyncRestrictOp : NVVM_Op<"fence.release.sync_restrict"> {
+  let summary = "Uni-directional thread fence operation with release semantics";
+  let description = [{
+    The `nvvm.fence.release.sync_restrict` Op restricts the class of memory
+    operations for which the fence instruction provides the memory ordering guarantees.
+    `sync_restrict` restricts `release` memory semantics to `shared_cta` with cluster scope.
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+  }];
+
+  let assemblyFormat = "attr-dict";
+  let llvmBuilder = [{
+    createIntrinsicCall(builder,
+      llvm::Intrinsic::nvvm_fence_release_sync_restrict_space_cta_scope_cluster);
+  }];
+}
+
+def NVVM_FenceMbarrierInitOp : NVVM_Op<"fence.mbarrier.init"> {
+    let description = [{
+    Fence operation that applies on the prior nvvm.mbarrier.init
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+  }];
+
+  let assemblyFormat = "attr-dict";
+  let llvmBuilder = [{
+    createIntrinsicCall(builder, llvm::Intrinsic::nvvm_fence_mbarrier_init_release_cluster);
+  }];
 }
 
 def ProxyAlias : I32EnumAttrCase<"alias", 0, "alias">;
@@ -1339,10 +1398,15 @@ def ProxyKind : I32EnumAttr<"ProxyKind", "Proxy kind",
 }
 
 def ProxyKindAttr : EnumAttr<NVVM_Dialect, ProxyKind, "proxy_kind"> {
+  let description = [{
+    ProxyKind attribute represents a memory proxy which is an abstract label
+    applied to a method of memory access. When two memory operations use distinct
+    methods of memory access, they are said to be different proxies.
+  }];
   let assemblyFormat = "`<` $value `>`";
 }
 
-def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">,
+def NVVM_FenceProxyOp : NVVM_Op<"fence.proxy">,
   Arguments<(ins ProxyKindAttr:$kind,
                  OptionalAttr<SharedSpaceAttr>:$space)> {
   let description = [{
@@ -1353,16 +1417,11 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">,
   }];
   
   let assemblyFormat = "attr-dict";
-  let extraClassDefinition = [{
-    std::string $cppClass::getPtx() {
-      std::string ptx = "fence.proxy.";
-      ptx += stringifyProxyKind(getKind());
-      if(getKind() == NVVM::ProxyKind::async_shared)
-        { ptx += "::"; ptx += stringifySharedSpace(getSpace().value()); }
-      ptx += ";";
-      return ptx;
-    }
+
+  let llvmBuilder = [{
+    createIntrinsicCall(builder, getFenceProxyID($kind, $space));
   }];
+
   let hasVerifier = 1;
 }
 
@@ -1399,23 +1458,6 @@ def NVVM_FenceProxyAcquireOp : NVVM_Op<"fence.proxy.acquire">,
   let hasVerifier = 1;
 }
 
-def NVVM_MembarOp : NVVM_Op<"memory.barrier">,
-                    Arguments<(ins MemScopeKindAttr:$scope)> {
-  let summary = "Memory barrier operation";
-  let description = [{
-    `membar` operation guarantees that prior memory accesses requested by this
-    thread are performed at the specified `scope`, before later memory
-    operations requested by this thread following the membar instruction.
-
-    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar)
-  }];
-
-  let assemblyFormat = "$scope attr-dict";
-  let llvmBuilder = [{
-    createIntrinsicCall(builder, getMembarIntrinsicID($scope), {});
-  }];
-}
-
 def NVVM_FenceProxyReleaseOp : NVVM_Op<"fence.proxy.release">,
       Arguments<(ins MemScopeKindAttr:$scope,
                      DefaultValuedAttr<ProxyKindAttr,
@@ -1442,6 +1484,48 @@ def NVVM_FenceProxyReleaseOp : NVVM_Op<"fence.proxy.release">,
   let hasVerifier = 1;
 }
 
+def NVVM_FenceProxyAcquireSyncRestrictOp : NVVM_Op<"fence.proxy.acquire.sync_restrict">,
+      Arguments<(ins DefaultValuedAttr<ProxyKindAttr, "ProxyKind::GENERIC">:$fromProxy,
+                     DefaultValuedAttr<ProxyKindAttr, "ProxyKind::async">:$toProxy)> {
+  let summary = "Uni-directional thread fence operation with acquire semantics";
+  let description = [{
+    The `nvvm.fence.proxy.acquire.sync_restrict` Op used to establish
+    ordering between a prior memory access performed between proxies. Currently,
+    the ordering is only supported between async and generic proxies. `sync_restrict`
+    restricts `acquire` memory semantics to `shared_cluster` with cluster scope.
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+  }];
+
+  let assemblyFormat = "attr-dict";
+  let llvmBuilder = [{
+    createIntrinsicCall(builder,
+      llvm::Intrinsic::nvvm_fence_proxy_async_generic_acquire_sync_restrict_space_cluster_scope_cluster);
+  }];
+
+  let hasVerifier = 1;
+}
+
+def NVVM_FenceProxyReleaseSyncRestrictOp : NVVM_Op<"fence.proxy.release.sync_restrict">,
+      Arguments<(ins DefaultValuedAttr<ProxyKindAttr, "ProxyKind::GENERIC">:$fromProxy,
+                     DefaultValuedAttr<ProxyKindAttr, "ProxyKind::async">:$toProxy)> {
+  let summary = "Uni-directional thread fence operation with release semantics";
+  let description = [{
+    The `nvvm.fence.proxy.release.sync_restrict` Op used to establish
+    ordering between a prior memory access performed between proxies. Currently,
+    the ordering is only supported between async and generic proxies. `sync_restrict`
+    restricts `release` memory semantics to `shared_cta` with cluster scope.
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+  }];
+
+  let assemblyFormat = "attr-dict";
+  let llvmBuilder = [{
+    createIntrinsicCall(builder,
+      llvm::Intrinsic::nvvm_fence_proxy_async_generic_release_sync_restrict_space_cta_scope_cluster);
+  }];
+
+  let hasVerifier = 1;
+}
+
 def SetMaxRegisterActionIncrease : I32EnumAttrCase<"increase", 0>;
 def SetMaxRegisterActionDecrease   : I32EnumAttrCase<"decrease", 1>;
 def SetMaxRegisterAction : I32EnumAttr<"SetMaxRegisterAction", "NVVM set max register action",
@@ -1464,22 +1548,6 @@ def NVVM_SetMaxRegisterOp : NVVM_Op<"setmaxregister"> {
   }];
 }
 
-def NVVM_FenceMbarrierInitOp : NVVM_PTXBuilder_Op<"fence.mbarrier.init"> {
-  let arguments = (ins );
-    let description = [{
-    Fence operation that applies on the prior nvvm.mbarrier.init
-    
-    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
-  }];
-  
-  let assemblyFormat = "attr-dict";
-  let extraClassDefinition = [{        
-    std::string $cppClass::getPtx() {
-      return std::string("fence.mbarrier_init.release.cluster;");
-    }
-  }];
-}
-
 def ShflKindBfly : I32EnumAttrCase<"bfly", 0>;
 def ShflKindUp   : I32EnumAttrCase<"up", 1>;
 def ShflKindDown : I32EnumAttrCase<"down", 2>;
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 413125245aca8..0954a82d12bea 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -2112,7 +2112,6 @@ LogicalResult NVVM::FenceProxyAcquireOp::verify() {
   if (getToProxy() != NVVM::ProxyKind::TENSORMAP)
     return emitOpError("uni-directional proxies only support tensormap "
                        "for to_proxy attribute");
-
   return success();
 }
 
@@ -2124,7 +2123,26 @@ LogicalResult NVVM::FenceProxyReleaseOp::verify() {
   if (getToProxy() != NVVM::ProxyKind::TENSORMAP)
     return emitOpError("uni-directional proxies only support tensormap "
                        "for to_proxy attribute");
+  return success();
+}
+
+LogicalResult NVVM::FenceProxyAcquireSyncRestrictOp::verify() {
+  if (getFromProxy() != NVVM::ProxyKind::GENERIC)
+    return emitOpError("uni-directional proxies only support generic for "
+                       "from_proxy attribute");
+
+  if (getToProxy() != NVVM::ProxyKind::async)
+    return emitOpError("only async is supported for to_proxy attribute");
+  return success();
+}
+
+LogicalResult NVVM::FenceProxyReleaseSyncRestrictOp::verify() {
+  if (getFromProxy() != NVVM::ProxyKind::GENERIC)
+    return emitOpError("uni-directional proxies only support generic for "
+                       "from_proxy attribute");
 
+  if (getToProxy() != NVVM::ProxyKind::async)
+    return emitOpError("only async is supported for to_proxy attribute");
   return success();
 }
 
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
index cecff51e637a5..95d41d05658e6 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@@ -411,6 +411,25 @@ getTcgen05StIntrinsicID(mlir::NVVM::Tcgen05LdStShape shape, uint32_t num) {
   llvm_unreachable("unhandled tcgen05.st lowering");
 }
 
+static llvm::Intrinsic::ID getFenceProxyID(NVVM::ProxyKind kind,
+                                           std::optional<NVVM::SharedSpace> space) {
+  switch (kind) {
+  case NVVM::ProxyKind::alias:
+    return llvm::Intrinsic::nvvm_fence_proxy_alias;
+  case NVVM::ProxyKind::async:
+    return llvm::Intrinsic::nvvm_fence_proxy_async;
+  case NVVM::ProxyKind::async_global:
+    return llvm::Intrinsic::nvvm_fence_proxy_async_global;
+  case NVVM::ProxyKind::async_shared:
+    if (*space == NVVM::SharedSpace::shared_cta)
+      return llvm::Intrinsic::nvvm_fence_proxy_async_shared_cta;
+    else
+      return llvm::Intrinsic::nvvm_fence_proxy_async_shared_cluster;
+  default:
+    llvm_unreachable("unsupported proxy kind for fence.proxy Op");
+  }
+}
+
 namespace {
 /// Implementation of the dialect interface that converts operations belonging
 /// to the NVVM dialect to LLVM IR.
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index fbf8d9efb3bc7..442046b9f3302 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -584,13 +584,6 @@ func.func @cp_async_bulk_wait_group() {
 
 // -----
 
-func.func @fence_mbarrier_init() {
-  //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.mbarrier_init.release.cluster;"
-  nvvm.fence.mbarrier.init
-  func.return 
-}
-// -----
-
 func.func @fence_proxy() {
   //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.alias;", ""  : () -> ()
   nvvm.fence.proxy { kind = #nvvm.proxy_kind<alias>}
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index cd7bd37da5763..3bd9d9dd390e5 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -92,13 +92,6 @@ func.func @llvm_nvvm_cluster_wait() {
   llvm.return
 }
 
-// CHECK-LABEL: @llvm_nvvm_fence_sc_cluster
-func.func @llvm_nvvm_fence_sc_cluster() {
-  // CHECK: nvvm.fence.sc.cluster
-  nvvm.fence.sc.cluster
-  llvm.return
-}
-
 // CHECK-LABEL: @nvvm_shfl
 func.func @nvvm_shfl(
     %arg0 : i32, %arg1 : i32, %arg2 : i32,
diff --git a/mlir/test/Target/LLVMIR/nvvm/fence.mlir b/mlir/test/Target/LLVMIR/nvvm/fence.mlir
new file mode 100644
index 0000000000000..26ca922e178f2
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/fence.mlir
@@ -0,0 +1,78 @@
+// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s
+
+// CHECK-LABEL: @nvvm_fence_sync_restrict
+llvm.func @nvvm_fence_sync_restrict() {
+  // CHECK: call void @llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster()
+  nvvm.fence.acquire.sync_restrict
+  // CHECK: call void @llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster()
+  nvvm.fence.release.sync_restrict
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_fence_proxy
+llvm.func @nvvm_fence_proxy() {
+  // CHECK: call void @llvm.nvvm.fence.proxy.alias()
+  nvvm.fence.proxy {kind = #nvvm.proxy_kind<alias>}
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.async()
+  nvvm.fence.proxy {kind = #nvvm.proxy_kind<async>}
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.async.global()
+  nvvm.fence.proxy {kind = #nvvm.proxy_kind<async.global>}
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.async.shared_cta()
+  nvvm.fence.proxy {kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cta>}
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.async.shared_cluster()
+  nvvm.fence.proxy {kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cluster>}
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_fence_proxy_sync_restrict
+llvm.func @nvvm_fence_proxy_sync_restrict() {
+  // CHECK: call void @llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster()
+  nvvm.fence.proxy.acquire.sync_restrict
+  // CHECK: call void @llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster()
+  nvvm.fence.proxy.release.sync_restrict
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_release
+llvm.func @nvvm_fence_proxy_tensormap_generic_release() {
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cta()
+  nvvm.fence.proxy.release #nvvm.mem_scope<cta>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cluster()
+  nvvm.fence.proxy.release #nvvm.mem_scope<cluster>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.gpu()
+  nvvm.fence.proxy.release #nvvm.mem_scope<gpu>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.sys()
+  nvvm.fence.proxy.release #nvvm.mem_scope<sys>
+  llvm.return
+}
+
+// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_acquire
+llvm.func @nvvm_fence_proxy_tensormap_generic_acquire(%addr : !llvm.ptr) {
+  %c128 = llvm.mlir.constant(128) : i32
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr {{%[0-9]+}}, i32 128)
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %c128
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cluster(ptr {{%[0-9]+}}, i32 128)
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<cluster> %addr, %c128
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu(ptr {{%[0-9]+}}, i32 128)
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<gpu> %addr, %c128
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.sys(ptr {{%[0-9]+}}, i32 128)
+  nvvm.fence.proxy.acquire #nvvm.mem_scope<sys> %addr, %c128
+  llvm.return
+}
+
+// CHECK-LABEL: @fence_mbarrier_init
+llvm.func @fence_mbarrier_init() {
+  // CHECK: call void @llvm.nvvm.fence.mbarrier_init.release.cluster()
+  nvvm.fence.mbarrier.init
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 5cba5c4fceefd..c4a69097692cb 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -699,42 +699,6 @@ llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant},
   llvm.return
 }
 
-
-// -----
-// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_release
-llvm.func @nvvm_fence_proxy_tensormap_generic_release() {
-  %c128 = llvm.mlir.constant(128) : i32
-  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cta()
-  nvvm.fence.proxy.release #nvvm.mem_scope<cta>
-
-  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cluster()
-  nvvm.fence.proxy.release #nvvm.mem_scope<cluster>
-
-  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.gpu()
-  nvvm.fence.proxy.release #nvvm.mem_scope<gpu>
-
-  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.sys()
-  nvvm.fence.proxy.release #nvvm.mem_scope<sys>
-  llvm.return
-}
-
-// -----
-// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_acquire
-llvm.func @nvvm_fence_proxy_tensormap_generic_acquire(%addr : !llvm.ptr) {
-  %c128 = llvm.mlir.constant(128) : i32
-  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr {{%[0-9]+}}, i32 128)
-  nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %c128
-
-  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cluster(ptr {{%[0-9]+}}, i32 128)
-  nvvm.fence.proxy.acquire #nvvm.mem_scope<cluster> %addr, %c128
-
-  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu(ptr {{%[0-9]+}}, i32 128)
-  nvvm.fence.proxy.acquire #nvvm.mem_scope<gpu> %addr, %c128
-
-  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.sys(ptr {{%[0-9]+}}, i32 128)
-  nvvm.fence.proxy.acquire #nvvm.mem_scope<sys> %addr, %c128
-  llvm.return
-}
 // -----
 
 // CHECK-LABEL: @nvvm_exit