[llvm] [OpenMP] Replace AMDGPU fences with generic scoped fences (PR #119619)
Joseph Huber via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 11 13:57:43 PST 2024
https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/119619
Summary:
This is simpler and more common. I would have replaced the CUDA uses and
made them the same, but currently the backend doesn't fully codegen these
fences and just emits a full system-wide barrier as a fallback.
From d29efa2d2f46bbe46815e9d880ffd439f236fb4f Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 11 Dec 2024 15:55:43 -0600
Subject: [PATCH] [OpenMP] Replace AMDGPU fences with generic scoped fences
Summary:
This is simpler and more common. I would have replaced the CUDA uses and
made them the same, but currently the backend doesn't fully codegen these
fences and just emits a full system-wide barrier as a fallback.
---
offload/DeviceRTL/include/Synchronization.h | 8 ++++
offload/DeviceRTL/src/Synchronization.cpp | 44 +++------------------
2 files changed, 13 insertions(+), 39 deletions(-)
diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h
index 874974cc861df2..7a73f9ba72877a 100644
--- a/offload/DeviceRTL/include/Synchronization.h
+++ b/offload/DeviceRTL/include/Synchronization.h
@@ -26,6 +26,14 @@ enum OrderingTy {
seq_cst = __ATOMIC_SEQ_CST,
};
+enum ScopeTy {
+ system = __MEMORY_SCOPE_SYSTEM,
+ device_ = __MEMORY_SCOPE_DEVICE,
+ workgroup = __MEMORY_SCOPE_WRKGRP,
+ wavefront = __MEMORY_SCOPE_WVFRNT,
+ single = __MEMORY_SCOPE_SINGLE,
+};
+
enum MemScopeTy {
all, // All threads on all devices
device, // All threads on the device
diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp
index 9ea8d171cc830e..3aee23a865d3cf 100644
--- a/offload/DeviceRTL/src/Synchronization.cpp
+++ b/offload/DeviceRTL/src/Synchronization.cpp
@@ -232,50 +232,16 @@ void namedBarrier() {
fence::team(atomic::release);
}
-// sema checking of amdgcn_fence is aggressive. Intention is to patch clang
-// so that it is usable within a template environment and so that a runtime
-// value of the memory order is expanded to this switch within clang/llvm.
void fenceTeam(atomic::OrderingTy Ordering) {
- switch (Ordering) {
- default:
- __builtin_unreachable();
- case atomic::aquire:
- return __builtin_amdgcn_fence(atomic::aquire, "workgroup");
- case atomic::release:
- return __builtin_amdgcn_fence(atomic::release, "workgroup");
- case atomic::acq_rel:
- return __builtin_amdgcn_fence(atomic::acq_rel, "workgroup");
- case atomic::seq_cst:
- return __builtin_amdgcn_fence(atomic::seq_cst, "workgroup");
- }
+ return __scoped_atomic_thread_fence(Ordering, atomic::workgroup);
}
+
void fenceKernel(atomic::OrderingTy Ordering) {
- switch (Ordering) {
- default:
- __builtin_unreachable();
- case atomic::aquire:
- return __builtin_amdgcn_fence(atomic::aquire, "agent");
- case atomic::release:
- return __builtin_amdgcn_fence(atomic::release, "agent");
- case atomic::acq_rel:
- return __builtin_amdgcn_fence(atomic::acq_rel, "agent");
- case atomic::seq_cst:
- return __builtin_amdgcn_fence(atomic::seq_cst, "agent");
- }
+ return __scoped_atomic_thread_fence(Ordering, atomic::device_);
}
+
void fenceSystem(atomic::OrderingTy Ordering) {
- switch (Ordering) {
- default:
- __builtin_unreachable();
- case atomic::aquire:
- return __builtin_amdgcn_fence(atomic::aquire, "");
- case atomic::release:
- return __builtin_amdgcn_fence(atomic::release, "");
- case atomic::acq_rel:
- return __builtin_amdgcn_fence(atomic::acq_rel, "");
- case atomic::seq_cst:
- return __builtin_amdgcn_fence(atomic::seq_cst, "");
- }
+ return __scoped_atomic_thread_fence(Ordering, atomic::system);
}
void syncWarp(__kmpc_impl_lanemask_t) {
More information about the llvm-commits
mailing list