[flang-commits] [flang] 873b8d5 - [flang][cuda][NFC] Use NVVM operation for thread syncs (#166999)
via flang-commits
flang-commits at lists.llvm.org
Fri Nov 7 11:45:46 PST 2025
Author: Valentin Clement (バレンタイン クレメン)
Date: 2025-11-07T11:45:42-08:00
New Revision: 873b8d502a8af9d92c5b3bfa47c1dd68e609c6eb
URL: https://github.com/llvm/llvm-project/commit/873b8d502a8af9d92c5b3bfa47c1dd68e609c6eb
DIFF: https://github.com/llvm/llvm-project/commit/873b8d502a8af9d92c5b3bfa47c1dd68e609c6eb.diff
LOG: [flang][cuda][NFC] Use NVVM operation for thread syncs (#166999)
Use the operation introduced in #166698. Also split the test into a new
file since `flang/test/Lower/CUDA/cuda-device-proc.cuf` is getting too
big. I'm planning to reorganize this file to have better separation of
the tests.
Added:
flang/test/Lower/CUDA/cuda-synchronization.cuf
Modified:
flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
flang/test/Lower/CUDA/cuda-device-proc.cuf
Removed:
################################################################################
diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index 4c0d266428632..18b56d384b479 100644
--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -1309,34 +1309,22 @@ CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType,
// THREADFENCE
void CUDAIntrinsicLibrary::genThreadFence(
llvm::ArrayRef<fir::ExtendedValue> args) {
- constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl";
- mlir::FunctionType funcType =
- mlir::FunctionType::get(builder.getContext(), {}, {});
- auto funcOp = builder.createFunction(loc, funcName, funcType);
- llvm::SmallVector<mlir::Value> noArgs;
- fir::CallOp::create(builder, loc, funcOp, noArgs);
+ assert(args.size() == 0);
+ mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::GPU);
}
// THREADFENCE_BLOCK
void CUDAIntrinsicLibrary::genThreadFenceBlock(
llvm::ArrayRef<fir::ExtendedValue> args) {
- constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta";
- mlir::FunctionType funcType =
- mlir::FunctionType::get(builder.getContext(), {}, {});
- auto funcOp = builder.createFunction(loc, funcName, funcType);
- llvm::SmallVector<mlir::Value> noArgs;
- fir::CallOp::create(builder, loc, funcOp, noArgs);
+ assert(args.size() == 0);
+ mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::CTA);
}
// THREADFENCE_SYSTEM
void CUDAIntrinsicLibrary::genThreadFenceSystem(
llvm::ArrayRef<fir::ExtendedValue> args) {
- constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys";
- mlir::FunctionType funcType =
- mlir::FunctionType::get(builder.getContext(), {}, {});
- auto funcOp = builder.createFunction(loc, funcName, funcType);
- llvm::SmallVector<mlir::Value> noArgs;
- fir::CallOp::create(builder, loc, funcOp, noArgs);
+ assert(args.size() == 0);
+ mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::SYS);
}
// TMA_BULK_COMMIT_GROUP
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 9f8f74a0c7b5e..3a255afd59263 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -22,9 +22,6 @@ attributes(global) subroutine devsub()
call syncthreads()
call syncwarp(1)
- call threadfence()
- call threadfence_block()
- call threadfence_system()
ret = syncthreads_and(1)
res = syncthreads_and(tid > offset)
ret = syncthreads_count(1)
@@ -106,9 +103,6 @@ end
! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
! CHECK: nvvm.barrier0
! CHECK: nvvm.bar.warp.sync %c1{{.*}} : i32
-! CHECK: fir.call @llvm.nvvm.membar.gl() fastmath<contract> : () -> ()
-! CHECK: fir.call @llvm.nvvm.membar.cta() fastmath<contract> : () -> ()
-! CHECK: fir.call @llvm.nvvm.membar.sys() fastmath<contract> : () -> ()
! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1{{.*}}) fastmath<contract> : (i32) -> i32
! CHECK: %[[A:.*]] = fir.load %{{.*}} : !fir.ref<i32>
! CHECK: %[[B:.*]] = fir.load %{{.*}} : !fir.ref<i32>
diff --git a/flang/test/Lower/CUDA/cuda-synchronization.cuf b/flang/test/Lower/CUDA/cuda-synchronization.cuf
new file mode 100644
index 0000000000000..6e2e23423c360
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-synchronization.cuf
@@ -0,0 +1,14 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test CUDA Fortran intrinsic lowering for synchronization.
+
+attributes(global) subroutine sync()
+ call threadfence()
+ call threadfence_block()
+ call threadfence_system()
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsync() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+! CHECK: nvvm.memory.barrier <gpu>
+! CHECK: nvvm.memory.barrier <cta>
+! CHECK: nvvm.memory.barrier <sys>
More information about the flang-commits
mailing list