[flang-commits] [flang] [flang][cuda][NFC] Simplify thread fence lowering (PR #167009)
Valentin Clement バレンタイン クレメン via flang-commits
flang-commits at lists.llvm.org
Fri Nov 7 11:52:52 PST 2025
https://github.com/clementval created https://github.com/llvm/llvm-project/pull/167009
Use a single templated function to generate the three kinds of thread fence, so that the duplicated code can be removed.
>From fa56a80edd4dfe8e3f5245d5efc2ad2be2bbf48b Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Fri, 7 Nov 2025 11:51:57 -0800
Subject: [PATCH] [flang][cuda][NFC] Simplify thread fence lowering
---
.../Optimizer/Builder/CUDAIntrinsicCall.h | 3 +--
.../Optimizer/Builder/CUDAIntrinsicCall.cpp | 25 +++++--------------
2 files changed, 7 insertions(+), 21 deletions(-)
diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
index d735ce95a83dc..ae7d566920656 100644
--- a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
@@ -63,9 +63,8 @@ struct CUDAIntrinsicLibrary : IntrinsicLibrary {
mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>);
mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>);
mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ template <mlir::NVVM::MemScopeKind scope>
void genThreadFence(llvm::ArrayRef<fir::ExtendedValue>);
- void genThreadFenceBlock(llvm::ArrayRef<fir::ExtendedValue>);
- void genThreadFenceSystem(llvm::ArrayRef<fir::ExtendedValue>);
void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>);
void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>);
void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>);
diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index 18b56d384b479..323d1ef78e65d 100644
--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -472,17 +472,17 @@ static constexpr IntrinsicHandler cudaHandlers[]{
/*isElemental=*/false},
{"threadfence",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
- &CI::genThreadFence),
+ &CI::genThreadFence<mlir::NVVM::MemScopeKind::GPU>),
{},
/*isElemental=*/false},
{"threadfence_block",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
- &CI::genThreadFenceBlock),
+ &CI::genThreadFence<mlir::NVVM::MemScopeKind::CTA>),
{},
/*isElemental=*/false},
{"threadfence_system",
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
- &CI::genThreadFenceSystem),
+ &CI::genThreadFence<mlir::NVVM::MemScopeKind::SYS>),
{},
/*isElemental=*/false},
{"tma_bulk_commit_group",
@@ -1306,25 +1306,12 @@ CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType,
return res;
}
-// THREADFENCE
+// THREADFENCE, THREADFENCE_BLOCK, THREADFENCE_SYSTEM
+template <mlir::NVVM::MemScopeKind scope>
void CUDAIntrinsicLibrary::genThreadFence(
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 0);
- mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::GPU);
-}
-
-// THREADFENCE_BLOCK
-void CUDAIntrinsicLibrary::genThreadFenceBlock(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 0);
- mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::CTA);
-}
-
-// THREADFENCE_SYSTEM
-void CUDAIntrinsicLibrary::genThreadFenceSystem(
- llvm::ArrayRef<fir::ExtendedValue> args) {
- assert(args.size() == 0);
- mlir::NVVM::MembarOp::create(builder, loc, mlir::NVVM::MemScopeKind::SYS);
+ mlir::NVVM::MembarOp::create(builder, loc, scope);
}
// TMA_BULK_COMMIT_GROUP
More information about the flang-commits
mailing list