[flang-commits] [flang] [flang][cuda][NFC] Use NVVM operation for thread syncs (PR #166999)

Fri Nov 7 11:23:26 PST 2025

https://github.com/clementval created https://github.com/llvm/llvm-project/pull/166999

Use the operation introduced in #166698. Also split the test into a new file since `flang/test/Lower/CUDA/cuda-device-proc.cuf` is getting too big. I'm planning to reorganize this file to have better separation of the tests.

>From e0c9f2f9732910f36cc5e1f5007669741eb4a232 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Fri, 7 Nov 2025 11:21:42 -0800
Subject: [PATCH] [flang][cuda][NFC] Use NVVM operation for thread syncs

---
 .../Optimizer/Builder/CUDAIntrinsicCall.cpp   | 24 +++++--------------
 flang/test/Lower/CUDA/cuda-device-proc.cuf    |  6 -----
 .../test/Lower/CUDA/cuda-synchronization.cuf  | 14 +++++++++++
 3 files changed, 20 insertions(+), 24 deletions(-)
 create mode 100644 flang/test/Lower/CUDA/cuda-synchronization.cuf

diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index 4c0d266428632..18b56d384b479 100644
--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -1309,34 +1309,22 @@ CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType,
 // THREADFENCE
 void CUDAIntrinsicLibrary::genThreadFence(
     llvm::ArrayRef<fir::ExtendedValue> args) {
-  constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl";
-  mlir::FunctionType funcType =
-      mlir::FunctionType::get(builder.getContext(), {}, {});
-  auto funcOp = builder.createFunction(loc, funcName, funcType);
-  llvm::SmallVector<mlir::Value> noArgs;
-  fir::CallOp::create(builder, loc, funcOp, noArgs);
+  assert(args.size() == 0);
+  mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::GPU);
 }
 
 // THREADFENCE_BLOCK
 void CUDAIntrinsicLibrary::genThreadFenceBlock(
     llvm::ArrayRef<fir::ExtendedValue> args) {
-  constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta";
-  mlir::FunctionType funcType =
-      mlir::FunctionType::get(builder.getContext(), {}, {});
-  auto funcOp = builder.createFunction(loc, funcName, funcType);
-  llvm::SmallVector<mlir::Value> noArgs;
-  fir::CallOp::create(builder, loc, funcOp, noArgs);
+  assert(args.size() == 0);
+  mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::CTA);
 }
 
 // THREADFENCE_SYSTEM
 void CUDAIntrinsicLibrary::genThreadFenceSystem(
     llvm::ArrayRef<fir::ExtendedValue> args) {
-  constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys";
-  mlir::FunctionType funcType =
-      mlir::FunctionType::get(builder.getContext(), {}, {});
-  auto funcOp = builder.createFunction(loc, funcName, funcType);
-  llvm::SmallVector<mlir::Value> noArgs;
-  fir::CallOp::create(builder, loc, funcOp, noArgs);
+  assert(args.size() == 0);
+  mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::SYS);
 }
 
 // TMA_BULK_COMMIT_GROUP
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 9f8f74a0c7b5e..3a255afd59263 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -22,9 +22,6 @@ attributes(global) subroutine devsub()
 
   call syncthreads()
   call syncwarp(1)
-  call threadfence()
-  call threadfence_block()
-  call threadfence_system()
   ret = syncthreads_and(1)
   res = syncthreads_and(tid > offset)
   ret = syncthreads_count(1)
@@ -106,9 +103,6 @@ end
 ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
 ! CHECK: nvvm.barrier0
 ! CHECK: nvvm.bar.warp.sync %c1{{.*}} : i32 
-! CHECK: fir.call @llvm.nvvm.membar.gl() fastmath<contract> : () -> ()
-! CHECK: fir.call @llvm.nvvm.membar.cta() fastmath<contract> : () -> ()
-! CHECK: fir.call @llvm.nvvm.membar.sys() fastmath<contract> : () -> ()
 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1{{.*}}) fastmath<contract> : (i32) -> i32
 ! CHECK: %[[A:.*]] = fir.load %{{.*}} : !fir.ref<i32>
 ! CHECK: %[[B:.*]] = fir.load %{{.*}} : !fir.ref<i32>
diff --git a/flang/test/Lower/CUDA/cuda-synchronization.cuf b/flang/test/Lower/CUDA/cuda-synchronization.cuf
new file mode 100644
index 0000000000000..6e2e23423c360
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-synchronization.cuf
@@ -0,0 +1,14 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test CUDA Fortran intrinsics lowerings for synchronization.
+
+attributes(global) subroutine sync()
+  call threadfence()
+  call threadfence_block()
+  call threadfence_system()
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsync() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+! CHECK: nvvm.memory.barrier <gpu>
+! CHECK: nvvm.memory.barrier <cta>
+! CHECK: nvvm.memory.barrier <sys>