[flang-commits] [flang] 873b8d5 - [flang][cuda][NFC] Use NVVM operation for thread syncs (#166999)

Fri Nov 7 11:45:46 PST 2025

Author: Valentin Clement (バレンタイン クレメン)
Date: 2025-11-07T11:45:42-08:00
New Revision: 873b8d502a8af9d92c5b3bfa47c1dd68e609c6eb

URL: https://github.com/llvm/llvm-project/commit/873b8d502a8af9d92c5b3bfa47c1dd68e609c6eb
DIFF: https://github.com/llvm/llvm-project/commit/873b8d502a8af9d92c5b3bfa47c1dd68e609c6eb.diff

LOG: [flang][cuda][NFC] Use NVVM operation for thread syncs (#166999)

Use the operation introduced in #166698. Also split the test into a new
file since `flang/test/Lower/CUDA/cuda-device-proc.cuf` is getting too
big. I'm planning to reorganize this file to have better separation of
the tests.

Added: 
    flang/test/Lower/CUDA/cuda-synchronization.cuf

Modified: 
    flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
    flang/test/Lower/CUDA/cuda-device-proc.cuf

Removed: 
    


################################################################################
diff  --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index 4c0d266428632..18b56d384b479 100644

--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -1309,34 +1309,22 @@ CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType,
 // THREADFENCE
 void CUDAIntrinsicLibrary::genThreadFence(
     llvm::ArrayRef<fir::ExtendedValue> args) {
-  constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl";
-  mlir::FunctionType funcType =
-      mlir::FunctionType::get(builder.getContext(), {}, {});
-  auto funcOp = builder.createFunction(loc, funcName, funcType);
-  llvm::SmallVector<mlir::Value> noArgs;
-  fir::CallOp::create(builder, loc, funcOp, noArgs);
+  assert(args.size() == 0);
+  mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::GPU);
 }
 
 // THREADFENCE_BLOCK
 void CUDAIntrinsicLibrary::genThreadFenceBlock(
     llvm::ArrayRef<fir::ExtendedValue> args) {
-  constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta";
-  mlir::FunctionType funcType =
-      mlir::FunctionType::get(builder.getContext(), {}, {});
-  auto funcOp = builder.createFunction(loc, funcName, funcType);
-  llvm::SmallVector<mlir::Value> noArgs;
-  fir::CallOp::create(builder, loc, funcOp, noArgs);
+  assert(args.size() == 0);
+  mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::CTA);
 }
 
 // THREADFENCE_SYSTEM
 void CUDAIntrinsicLibrary::genThreadFenceSystem(
     llvm::ArrayRef<fir::ExtendedValue> args) {
-  constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys";
-  mlir::FunctionType funcType =
-      mlir::FunctionType::get(builder.getContext(), {}, {});
-  auto funcOp = builder.createFunction(loc, funcName, funcType);
-  llvm::SmallVector<mlir::Value> noArgs;
-  fir::CallOp::create(builder, loc, funcOp, noArgs);
+  assert(args.size() == 0);
+  mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::SYS);
 }
 
 // TMA_BULK_COMMIT_GROUP

diff  --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 9f8f74a0c7b5e..3a255afd59263 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -22,9 +22,6 @@ attributes(global) subroutine devsub()
 
   call syncthreads()
   call syncwarp(1)
-  call threadfence()
-  call threadfence_block()
-  call threadfence_system()
   ret = syncthreads_and(1)
   res = syncthreads_and(tid > offset)
   ret = syncthreads_count(1)
@@ -106,9 +103,6 @@ end
 ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
 ! CHECK: nvvm.barrier0
 ! CHECK: nvvm.bar.warp.sync %c1{{.*}} : i32 
-! CHECK: fir.call @llvm.nvvm.membar.gl() fastmath<contract> : () -> ()
-! CHECK: fir.call @llvm.nvvm.membar.cta() fastmath<contract> : () -> ()
-! CHECK: fir.call @llvm.nvvm.membar.sys() fastmath<contract> : () -> ()
 ! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1{{.*}}) fastmath<contract> : (i32) -> i32
 ! CHECK: %[[A:.*]] = fir.load %{{.*}} : !fir.ref<i32>
 ! CHECK: %[[B:.*]] = fir.load %{{.*}} : !fir.ref<i32>

diff  --git a/flang/test/Lower/CUDA/cuda-synchronization.cuf b/flang/test/Lower/CUDA/cuda-synchronization.cuf
new file mode 100644
index 0000000000000..6e2e23423c360
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-synchronization.cuf
@@ -0,0 +1,14 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test CUDA Fortran instrinsics lowerings for synchronization.
+
+attributes(global) subroutine sync()
+  call threadfence()
+  call threadfence_block()
+  call threadfence_system()
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsync() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+! CHECK: nvvm.memory.barrier <gpu>
+! CHECK: nvvm.memory.barrier <cta>
+! CHECK: nvvm.memory.barrier <sys>