[libc-commits] [libc] [libc] Replace usage of GPU helpers with ones from 'gpuintrin.h' (PR #116454)
via libc-commits
libc-commits at lists.llvm.org
Fri Nov 15 21:47:43 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-libc
Author: Joseph Huber (jhuber6)
Changes:
Summary:
These GPU helpers are now provided by the `gpuintrin.h` resource header, so cut them from the dependencies and only provide the wrappers we use for RPC.
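The new `rpc::` wrappers all follow one pattern: on GPU targets they forward to the `gpuintrin.h` intrinsics, while host builds degrade to trivial single-lane behavior. A minimal sketch, condensed from the `rpc_util.h` diff below (the real code uses `LIBC_INLINE` and lives in `namespace rpc`):

```cpp
#include <stdint.h>

// Pull in the GPU intrinsics only when targeting a GPU.
#if defined(__NVPTX__) || defined(__AMDGPU__)
#include <gpuintrin.h>
#define RPC_TARGET_IS_GPU
#endif

// Copy the value from the first active lane to the rest of the lane group;
// the host has a single "lane", so the value is returned unchanged.
static inline uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
#ifdef RPC_TARGET_IS_GPU
  return __gpu_read_first_lane_u32(lane_mask, x);
#else
  (void)lane_mask;
  return x;
#endif
}
```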
---
Full diff: https://github.com/llvm/llvm-project/pull/116454.diff
2 Files Affected:
- (modified) libc/src/__support/RPC/rpc.h (+63-47)
- (modified) libc/src/__support/RPC/rpc_util.h (+72-14)
``````````diff
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index c421dd82b29450..d0f653c384e166 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -19,8 +19,6 @@
#define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_H
#include "rpc_util.h"
-#include "src/__support/CPP/algorithm.h" // max
-#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/optional.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/macros/config.h"
@@ -30,6 +28,17 @@
namespace LIBC_NAMESPACE_DECL {
namespace rpc {
+/// Use scoped atomic variants if they are available for the target.
+#if !__has_builtin(__scoped_atomic_load_n)
+#define __scoped_atomic_load_n(src, ord, scp) __atomic_load_n(src, ord)
+#define __scoped_atomic_store_n(dst, src, ord, scp) \
+ __atomic_store_n(dst, src, ord)
+#define __scoped_atomic_fetch_or(src, val, ord, scp) \
+ __atomic_fetch_or(src, val, ord)
+#define __scoped_atomic_fetch_and(src, val, ord, scp) \
+ __atomic_fetch_and(src, val, ord)
+#endif
+
/// A fixed size channel used to communicate between the RPC client and server.
struct Buffer {
uint64_t data[8];
@@ -67,18 +76,18 @@ template <bool Invert> struct Process {
LIBC_INLINE ~Process() = default;
uint32_t port_count = 0;
- cpp::Atomic<uint32_t> *inbox = nullptr;
- cpp::Atomic<uint32_t> *outbox = nullptr;
+ uint32_t *inbox = nullptr;
+ uint32_t *outbox = nullptr;
Header *header = nullptr;
Buffer *packet = nullptr;
static constexpr uint64_t NUM_BITS_IN_WORD = sizeof(uint32_t) * 8;
- cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
+ uint32_t lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
LIBC_INLINE Process(uint32_t port_count, void *buffer)
- : port_count(port_count), inbox(reinterpret_cast<cpp::Atomic<uint32_t> *>(
+ : port_count(port_count), inbox(reinterpret_cast<uint32_t *>(
advance(buffer, inbox_offset(port_count)))),
- outbox(reinterpret_cast<cpp::Atomic<uint32_t> *>(
+ outbox(reinterpret_cast<uint32_t *>(
advance(buffer, outbox_offset(port_count)))),
header(reinterpret_cast<Header *>(
advance(buffer, header_offset(port_count)))),
@@ -101,16 +110,16 @@ template <bool Invert> struct Process {
/// Retrieve the inbox state from memory shared between processes.
LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const {
- return gpu::broadcast_value(
- lane_mask,
- inbox[index].load(cpp::MemoryOrder::RELAXED, cpp::MemoryScope::SYSTEM));
+ return rpc::broadcast_value(
+ lane_mask, __scoped_atomic_load_n(&inbox[index], __ATOMIC_RELAXED,
+ __MEMORY_SCOPE_SYSTEM));
}
/// Retrieve the outbox state from memory shared between processes.
LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) const {
- return gpu::broadcast_value(lane_mask,
- outbox[index].load(cpp::MemoryOrder::RELAXED,
- cpp::MemoryScope::SYSTEM));
+ return rpc::broadcast_value(
+ lane_mask, __scoped_atomic_load_n(&outbox[index], __ATOMIC_RELAXED,
+ __MEMORY_SCOPE_SYSTEM));
}
/// Signal to the other process that this one is finished with the buffer.
@@ -119,9 +128,9 @@ template <bool Invert> struct Process {
/// cheaper than calling load_outbox to get the value to store.
LIBC_INLINE uint32_t invert_outbox(uint32_t index, uint32_t current_outbox) {
uint32_t inverted_outbox = !current_outbox;
- atomic_thread_fence(cpp::MemoryOrder::RELEASE);
- outbox[index].store(inverted_outbox, cpp::MemoryOrder::RELAXED,
- cpp::MemoryScope::SYSTEM);
+ __atomic_thread_fence(__ATOMIC_RELEASE);
+ __scoped_atomic_store_n(&outbox[index], inverted_outbox, __ATOMIC_RELAXED,
+ __MEMORY_SCOPE_SYSTEM);
return inverted_outbox;
}
@@ -133,7 +142,7 @@ template <bool Invert> struct Process {
sleep_briefly();
in = load_inbox(lane_mask, index);
}
- atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+ __atomic_thread_fence(__ATOMIC_ACQUIRE);
}
/// The packet is a linearly allocated array of buffers used to communicate
@@ -153,7 +162,7 @@ template <bool Invert> struct Process {
/// Attempt to claim the lock at index. Return true on lock taken.
/// lane_mask is a bitmap of the threads in the warp that would hold the
- /// single lock on success, e.g. the result of gpu::get_lane_mask()
+ /// single lock on success, e.g. the result of rpc::get_lane_mask()
/// The lock is held when the n-th bit of the lock bitfield is set.
[[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask,
uint32_t index) {
@@ -165,12 +174,12 @@ template <bool Invert> struct Process {
// There may be threads active which are not in lane mask which must not
// succeed in taking the lock, as otherwise it will leak. This is handled
// by making threads which are not in lane_mask or with 0, a no-op.
- uint32_t id = gpu::get_lane_id();
+ uint32_t id = rpc::get_lane_id();
bool id_in_lane_mask = lane_mask & (1ul << id);
// All threads in the warp call fetch_or. Possibly at the same time.
bool before = set_nth(lock, index, id_in_lane_mask);
- uint64_t packed = gpu::ballot(lane_mask, before);
+ uint64_t packed = rpc::ballot(lane_mask, before);
// If every bit set in lane_mask is also set in packed, every single thread
// in the warp failed to get the lock. Ballot returns unset for threads not
@@ -190,7 +199,7 @@ template <bool Invert> struct Process {
// inlining the current function.
bool holding_lock = lane_mask != packed;
if (holding_lock)
- atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+ __atomic_thread_fence(__ATOMIC_ACQUIRE);
return holding_lock;
}
@@ -199,19 +208,19 @@ template <bool Invert> struct Process {
[[clang::convergent]] LIBC_INLINE void unlock(uint64_t lane_mask,
uint32_t index) {
// Do not move any writes past the unlock.
- atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+ __atomic_thread_fence(__ATOMIC_RELEASE);
// Use exactly one thread to clear the nth bit in the lock array Must
// restrict to a single thread to avoid one thread dropping the lock, then
// an unrelated warp claiming the lock, then a second thread in this warp
// dropping the lock again.
- clear_nth(lock, index, gpu::is_first_lane(lane_mask));
- gpu::sync_lane(lane_mask);
+ clear_nth(lock, index, rpc::is_first_lane(lane_mask));
+ rpc::sync_lane(lane_mask);
}
/// Number of bytes to allocate for an inbox or outbox.
LIBC_INLINE static constexpr uint64_t mailbox_bytes(uint32_t port_count) {
- return port_count * sizeof(cpp::Atomic<uint32_t>);
+ return port_count * sizeof(uint32_t);
}
/// Number of bytes to allocate for the buffer containing the packets.
@@ -242,24 +251,24 @@ template <bool Invert> struct Process {
}
/// Conditionally set the n-th bit in the atomic bitfield.
- LIBC_INLINE static constexpr uint32_t set_nth(cpp::Atomic<uint32_t> *bits,
- uint32_t index, bool cond) {
+ LIBC_INLINE static constexpr uint32_t set_nth(uint32_t *bits, uint32_t index,
+ bool cond) {
uint32_t slot = index / NUM_BITS_IN_WORD;
uint32_t bit = index % NUM_BITS_IN_WORD;
- return bits[slot].fetch_or(static_cast<uint32_t>(cond) << bit,
- cpp::MemoryOrder::RELAXED,
- cpp::MemoryScope::DEVICE) &
+ return __scoped_atomic_fetch_or(&bits[slot],
+ static_cast<uint32_t>(cond) << bit,
+ __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE) &
(1u << bit);
}
/// Conditionally clear the n-th bit in the atomic bitfield.
- LIBC_INLINE static constexpr uint32_t clear_nth(cpp::Atomic<uint32_t> *bits,
+ LIBC_INLINE static constexpr uint32_t clear_nth(uint32_t *bits,
uint32_t index, bool cond) {
uint32_t slot = index / NUM_BITS_IN_WORD;
uint32_t bit = index % NUM_BITS_IN_WORD;
- return bits[slot].fetch_and(~0u ^ (static_cast<uint32_t>(cond) << bit),
- cpp::MemoryOrder::RELAXED,
- cpp::MemoryScope::DEVICE) &
+ return __scoped_atomic_fetch_and(&bits[slot],
+ ~0u ^ (static_cast<uint32_t>(cond) << bit),
+ __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE) &
(1u << bit);
}
};
@@ -269,9 +278,9 @@ template <typename F>
LIBC_INLINE static void invoke_rpc(F &&fn, uint32_t lane_size,
uint64_t lane_mask, Buffer *slot) {
if constexpr (is_process_gpu()) {
- fn(&slot[gpu::get_lane_id()], gpu::get_lane_id());
+ fn(&slot[rpc::get_lane_id()], rpc::get_lane_id());
} else {
- for (uint32_t i = 0; i < lane_size; i += gpu::get_lane_size())
+ for (uint32_t i = 0; i < lane_size; i += rpc::get_num_lanes())
if (lane_mask & (1ul << i))
fn(&slot[i], i);
}
@@ -316,7 +325,7 @@ template <bool T> struct Port {
LIBC_INLINE void close() {
// Wait for all lanes to finish using the port.
- gpu::sync_lane(lane_mask);
+ rpc::sync_lane(lane_mask);
// The server is passive, if it own the buffer when it closes we need to
// give ownership back to the client.
@@ -450,7 +459,7 @@ LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
send([&](Buffer *buffer, uint32_t id) {
reinterpret_cast<uint64_t *>(buffer->data)[0] = lane_value(size, id);
num_sends = is_process_gpu() ? lane_value(size, id)
- : cpp::max(lane_value(size, id), num_sends);
+ : rpc::max(lane_value(size, id), num_sends);
uint64_t len =
lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t)
? sizeof(Buffer::data) - sizeof(uint64_t)
@@ -459,7 +468,7 @@ LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
});
uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
uint64_t mask = process.header[index].mask;
- while (gpu::ballot(mask, idx < num_sends)) {
+ while (rpc::ballot(mask, idx < num_sends)) {
send([=](Buffer *buffer, uint32_t id) {
uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
? sizeof(Buffer::data)
@@ -483,7 +492,7 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
lane_value(dst, id) =
reinterpret_cast<uint8_t *>(alloc(lane_value(size, id)));
num_recvs = is_process_gpu() ? lane_value(size, id)
- : cpp::max(lane_value(size, id), num_recvs);
+ : rpc::max(lane_value(size, id), num_recvs);
uint64_t len =
lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t)
? sizeof(Buffer::data) - sizeof(uint64_t)
@@ -492,7 +501,7 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
});
uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
uint64_t mask = process.header[index].mask;
- while (gpu::ballot(mask, idx < num_recvs)) {
+ while (rpc::ballot(mask, idx < num_recvs)) {
recv([=](Buffer *buffer, uint32_t id) {
uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
? sizeof(Buffer::data)
@@ -514,13 +523,13 @@ template <uint16_t opcode>
[[clang::convergent]] LIBC_INLINE Client::Port Client::open() {
// Repeatedly perform a naive linear scan for a port that can be opened to
// send data.
- for (uint32_t index = gpu::get_cluster_id();; ++index) {
+ for (uint32_t index = 0;; ++index) {
// Start from the beginning if we run out of ports to check.
if (index >= process.port_count)
index = 0;
// Attempt to acquire the lock on this index.
- uint64_t lane_mask = gpu::get_lane_mask();
+ uint64_t lane_mask = rpc::get_lane_mask();
if (!process.try_lock(lane_mask, index))
continue;
@@ -534,12 +543,12 @@ template <uint16_t opcode>
continue;
}
- if (gpu::is_first_lane(lane_mask)) {
+ if (rpc::is_first_lane(lane_mask)) {
process.header[index].opcode = opcode;
process.header[index].mask = lane_mask;
}
- gpu::sync_lane(lane_mask);
- return Port(process, lane_mask, gpu::get_lane_size(), index, out);
+ rpc::sync_lane(lane_mask);
+ return Port(process, lane_mask, rpc::get_num_lanes(), index, out);
}
}
@@ -549,7 +558,7 @@ template <uint16_t opcode>
Server::try_open(uint32_t lane_size, uint32_t start) {
// Perform a naive linear scan for a port that has a pending request.
for (uint32_t index = start; index < process.port_count; ++index) {
- uint64_t lane_mask = gpu::get_lane_mask();
+ uint64_t lane_mask = rpc::get_lane_mask();
uint32_t in = process.load_inbox(lane_mask, index);
uint32_t out = process.load_outbox(lane_mask, index);
@@ -583,6 +592,13 @@ LIBC_INLINE Server::Port Server::open(uint32_t lane_size) {
}
}
+#if !__has_builtin(__scoped_atomic_load_n)
+#undef __scoped_atomic_load_n
+#undef __scoped_atomic_store_n
+#undef __scoped_atomic_fetch_or
+#undef __scoped_atomic_fetch_and
+#endif
+
} // namespace rpc
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h
index 1a29ed65577148..39e5f30b84ac44 100644
--- a/libc/src/__support/RPC/rpc_util.h
+++ b/libc/src/__support/RPC/rpc_util.h
@@ -10,26 +10,87 @@
#define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_UTIL_H
#include "src/__support/CPP/type_traits.h"
-#include "src/__support/GPU/utils.h"
-#include "src/__support/macros/attributes.h"
#include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
#include "src/__support/threads/sleep.h"
-#include "src/string/memory_utils/generic/byte_per_byte.h"
-#include "src/string/memory_utils/inline_memcpy.h"
+
+#if defined(__NVPTX__) || defined(__AMDGPU__)
+#include <gpuintrin.h>
+#define RPC_TARGET_IS_GPU
+#endif
namespace LIBC_NAMESPACE_DECL {
namespace rpc {
/// Conditional to indicate if this process is running on the GPU.
LIBC_INLINE constexpr bool is_process_gpu() {
-#if defined(LIBC_TARGET_ARCH_IS_GPU)
+#ifdef RPC_TARGET_IS_GPU
return true;
#else
return false;
#endif
}
+/// Wait for all lanes in the group to complete.
+LIBC_INLINE void sync_lane(uint64_t lane_mask) {
+#ifdef RPC_TARGET_IS_GPU
+ return __gpu_sync_lane(lane_mask);
+#endif
+}
+
+/// Copies the value from the first active thread to the rest.
+LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
+#ifdef RPC_TARGET_IS_GPU
+ return __gpu_read_first_lane_u32(lane_mask, x);
+#else
+ return x;
+#endif
+}
+
+/// Returns the number lanes that participate in the RPC interface.
+LIBC_INLINE uint32_t get_num_lanes() {
+#ifdef RPC_TARGET_IS_GPU
+ return __gpu_num_lanes();
+#else
+ return 1;
+#endif
+}
+
+/// Returns the id of the thread inside of an AMD wavefront executing together.
+LIBC_INLINE uint64_t get_lane_mask() {
+#ifdef RPC_TARGET_IS_GPU
+ return __gpu_lane_mask();
+#else
+ return 1;
+#endif
+}
+
+/// Returns the id of the thread inside of an AMD wavefront executing together.
+LIBC_INLINE uint32_t get_lane_id() {
+#ifdef RPC_TARGET_IS_GPU
+ return __gpu_lane_id();
+#else
+ return 0;
+#endif
+}
+
+/// Conditional that is only true for a single thread in a lane.
+LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
+#ifdef RPC_TARGET_IS_GPU
+ return __gpu_is_first_in_lane(lane_mask);
+#else
+ return true;
+#endif
+}
+
+/// Returns a bitmask of threads in the current lane for which \p x is true.
+LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
+#ifdef RPC_TARGET_IS_GPU
+ return __gpu_ballot(lane_mask, x);
+#else
+ return x;
+#endif
+}
+
/// Return \p val aligned "upwards" according to \p align.
template <typename V, typename A>
LIBC_INLINE constexpr V align_up(V val, A align) {
@@ -57,14 +118,11 @@ template <typename T, typename U> LIBC_INLINE T *advance(T *ptr, U bytes) {
/// Wrapper around the optimal memory copy implementation for the target.
LIBC_INLINE void rpc_memcpy(void *dst, const void *src, size_t count) {
- // The built-in memcpy prefers to fully unroll loops. We want to minimize
- // resource usage so we use a single nounroll loop implementation.
-#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
- inline_memcpy_byte_per_byte(reinterpret_cast<Ptr>(dst),
- reinterpret_cast<CPtr>(src), count);
-#else
- inline_memcpy(dst, src, count);
-#endif
+ __builtin_memcpy(dst, src, count);
+}
+
+template <class T> LIBC_INLINE constexpr const T &max(const T &a, const T &b) {
+ return (a < b) ? b : a;
}
} // namespace rpc
``````````
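The scoped-atomic shim at the top of `rpc.h` uses the same compile-time fallback idea. A standalone sketch of that pattern, using only the macros defined in the diff, might look like:

```cpp
#include <stdint.h>

// If the compiler lacks the scoped atomic builtins, fall back to the plain
// __atomic_* forms and drop the scope argument during preprocessing, so the
// __MEMORY_SCOPE_* token is never evaluated on such compilers.
#if !__has_builtin(__scoped_atomic_load_n)
#define __scoped_atomic_load_n(src, ord, scp) __atomic_load_n(src, ord)
#endif

static inline uint32_t load_inbox_word(const uint32_t *inbox) {
  // Relaxed, system-scope load of one mailbox word, as in Process::load_inbox.
  return __scoped_atomic_load_n(inbox, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
}
```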
https://github.com/llvm/llvm-project/pull/116454