[libc-commits] [libc] [libc] Pull last dependencies into rpc_util.h (PR #116693)

Joseph Huber via libc-commits libc-commits at lists.llvm.org
Mon Nov 18 13:35:38 PST 2024


https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/116693

Summary:
This is the last piece needed to remove the dependencies on LLVM libc headers. It
pulls the `sleep_briefly`, `std::optional`, and `type_traits` definitions into the
`rpc_util.h` header. This duplicates some code for now, but the header will soon be
moved into the `include/rpc` directory. At that point I will replace all of the
`LIBC_INLINE` uses with `RPC_INLINE`. Internal use will then go through a wrapper
that keeps everything in the LIBC namespace, and implementations will provide
their own handling.
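
To make the follow-up concrete, here is a minimal sketch of what the `RPC_INLINE` split and the libc-side wrapper could look like. The paths and names below are assumptions for illustration, not part of this patch:

```c++
// Sketch only: a shared header under a future include/rpc directory would
// define its own RPC_INLINE and live outside the libc namespace.
#ifndef RPC_INLINE
#define RPC_INLINE inline
#endif

namespace rpc {
// Example of a helper written against RPC_INLINE instead of LIBC_INLINE.
RPC_INLINE constexpr bool is_process_gpu() {
#if defined(__NVPTX__) || defined(__AMDGPU__)
  return true;
#else
  return false;
#endif
}
} // namespace rpc

// The libc-internal rpc_util.h would then become a thin wrapper that pulls the
// shared definitions into the LIBC namespace, e.g.:
//   namespace LIBC_NAMESPACE_DECL {
//   namespace rpc {
//   using ::rpc::is_process_gpu;
//   } // namespace rpc
//   } // namespace LIBC_NAMESPACE_DECL
```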


From d175ef5c922021624d7c1b5c87cbef46d598d29d Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Fri, 15 Nov 2024 18:13:58 -0600
Subject: [PATCH 1/4] [libc] Replace usage of GPU helpers with ones from
 'gpuintrin.h'

Summary:
These are now provided by a resource header, so cut them from the
dependencies and only provide the ones we use for RPC.
---
 libc/src/__support/RPC/rpc.h      | 36 ++++++++--------
 libc/src/__support/RPC/rpc_util.h | 69 ++++++++++++++++++++++++++++++-
 2 files changed, 85 insertions(+), 20 deletions(-)

diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index a257003a907de8..1482230d73c7c1 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -110,14 +110,14 @@ template <bool Invert> struct Process {
 
   /// Retrieve the inbox state from memory shared between processes.
   LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const {
-    return gpu::broadcast_value(
+    return rpc::broadcast_value(
         lane_mask, __scoped_atomic_load_n(&inbox[index], __ATOMIC_RELAXED,
                                           __MEMORY_SCOPE_SYSTEM));
   }
 
   /// Retrieve the outbox state from memory shared between processes.
   LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) const {
-    return gpu::broadcast_value(
+    return rpc::broadcast_value(
         lane_mask, __scoped_atomic_load_n(&outbox[index], __ATOMIC_RELAXED,
                                           __MEMORY_SCOPE_SYSTEM));
   }
@@ -162,7 +162,7 @@ template <bool Invert> struct Process {
 
   /// Attempt to claim the lock at index. Return true on lock taken.
   /// lane_mask is a bitmap of the threads in the warp that would hold the
-  /// single lock on success, e.g. the result of gpu::get_lane_mask()
+  /// single lock on success, e.g. the result of rpc::get_lane_mask()
   /// The lock is held when the n-th bit of the lock bitfield is set.
   LIBC_INLINE bool try_lock(uint64_t lane_mask, uint32_t index) {
     // On amdgpu, test and set to the nth lock bit and a sync_lane would suffice
@@ -173,12 +173,12 @@ template <bool Invert> struct Process {
     // There may be threads active which are not in lane mask which must not
     // succeed in taking the lock, as otherwise it will leak. This is handled
     // by making threads which are not in lane_mask or with 0, a no-op.
-    uint32_t id = gpu::get_lane_id();
+    uint32_t id = rpc::get_lane_id();
     bool id_in_lane_mask = lane_mask & (1ul << id);
 
     // All threads in the warp call fetch_or. Possibly at the same time.
     bool before = set_nth(lock, index, id_in_lane_mask);
-    uint64_t packed = gpu::ballot(lane_mask, before);
+    uint64_t packed = rpc::ballot(lane_mask, before);
 
     // If every bit set in lane_mask is also set in packed, every single thread
     // in the warp failed to get the lock. Ballot returns unset for threads not
@@ -212,8 +212,8 @@ template <bool Invert> struct Process {
     // restrict to a single thread to avoid one thread dropping the lock, then
     // an unrelated warp claiming the lock, then a second thread in this warp
     // dropping the lock again.
-    clear_nth(lock, index, gpu::is_first_lane(lane_mask));
-    gpu::sync_lane(lane_mask);
+    clear_nth(lock, index, rpc::is_first_lane(lane_mask));
+    rpc::sync_lane(lane_mask);
   }
 
   /// Number of bytes to allocate for an inbox or outbox.
@@ -276,9 +276,9 @@ template <typename F>
 LIBC_INLINE static void invoke_rpc(F &&fn, uint32_t lane_size,
                                    uint64_t lane_mask, Buffer *slot) {
   if constexpr (is_process_gpu()) {
-    fn(&slot[gpu::get_lane_id()], gpu::get_lane_id());
+    fn(&slot[rpc::get_lane_id()], rpc::get_lane_id());
   } else {
-    for (uint32_t i = 0; i < lane_size; i += gpu::get_lane_size())
+    for (uint32_t i = 0; i < lane_size; i += rpc::get_num_lanes())
       if (lane_mask & (1ul << i))
         fn(&slot[i], i);
   }
@@ -323,7 +323,7 @@ template <bool T> struct Port {
 
   LIBC_INLINE void close() {
     // Wait for all lanes to finish using the port.
-    gpu::sync_lane(lane_mask);
+    rpc::sync_lane(lane_mask);
 
     // The server is passive, if it own the buffer when it closes we need to
     // give ownership back to the client.
@@ -466,7 +466,7 @@ LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
   });
   uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
   uint64_t mask = process.header[index].mask;
-  while (gpu::ballot(mask, idx < num_sends)) {
+  while (rpc::ballot(mask, idx < num_sends)) {
     send([=](Buffer *buffer, uint32_t id) {
       uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
                          ? sizeof(Buffer::data)
@@ -499,7 +499,7 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
   });
   uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
   uint64_t mask = process.header[index].mask;
-  while (gpu::ballot(mask, idx < num_recvs)) {
+  while (rpc::ballot(mask, idx < num_recvs)) {
     recv([=](Buffer *buffer, uint32_t id) {
       uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
                          ? sizeof(Buffer::data)
@@ -520,13 +520,13 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
 template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
   // Repeatedly perform a naive linear scan for a port that can be opened to
   // send data.
-  for (uint32_t index = gpu::get_cluster_id();; ++index) {
+  for (uint32_t index = 0;; ++index) {
     // Start from the beginning if we run out of ports to check.
     if (index >= process.port_count)
       index = 0;
 
     // Attempt to acquire the lock on this index.
-    uint64_t lane_mask = gpu::get_lane_mask();
+    uint64_t lane_mask = rpc::get_lane_mask();
     if (!process.try_lock(lane_mask, index))
       continue;
 
@@ -540,12 +540,12 @@ template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
       continue;
     }
 
-    if (gpu::is_first_lane(lane_mask)) {
+    if (rpc::is_first_lane(lane_mask)) {
       process.header[index].opcode = opcode;
       process.header[index].mask = lane_mask;
     }
-    gpu::sync_lane(lane_mask);
-    return Port(process, lane_mask, gpu::get_lane_size(), index, out);
+    rpc::sync_lane(lane_mask);
+    return Port(process, lane_mask, rpc::get_num_lanes(), index, out);
   }
 }
 
@@ -555,7 +555,7 @@ LIBC_INLINE cpp::optional<typename Server::Port>
 Server::try_open(uint32_t lane_size, uint32_t start) {
   // Perform a naive linear scan for a port that has a pending request.
   for (uint32_t index = start; index < process.port_count; ++index) {
-    uint64_t lane_mask = gpu::get_lane_mask();
+    uint64_t lane_mask = rpc::get_lane_mask();
     uint32_t in = process.load_inbox(lane_mask, index);
     uint32_t out = process.load_outbox(lane_mask, index);
 
diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h
index 93b8289617484e..39e5f30b84ac44 100644
--- a/libc/src/__support/RPC/rpc_util.h
+++ b/libc/src/__support/RPC/rpc_util.h
@@ -10,22 +10,87 @@
 #define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_UTIL_H
 
 #include "src/__support/CPP/type_traits.h"
-#include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/sleep.h"
 
+#if defined(__NVPTX__) || defined(__AMDGPU__)
+#include <gpuintrin.h>
+#define RPC_TARGET_IS_GPU
+#endif
+
 namespace LIBC_NAMESPACE_DECL {
 namespace rpc {
 
 /// Conditional to indicate if this process is running on the GPU.
 LIBC_INLINE constexpr bool is_process_gpu() {
-#if defined(__NVPTX__) || defined(__AMDGPU__)
+#ifdef RPC_TARGET_IS_GPU
   return true;
 #else
   return false;
 #endif
 }
 
+/// Wait for all lanes in the group to complete.
+LIBC_INLINE void sync_lane(uint64_t lane_mask) {
+#ifdef RPC_TARGET_IS_GPU
+  return __gpu_sync_lane(lane_mask);
+#endif
+}
+
+/// Copies the value from the first active thread to the rest.
+LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
+#ifdef RPC_TARGET_IS_GPU
+  return __gpu_read_first_lane_u32(lane_mask, x);
+#else
+  return x;
+#endif
+}
+
+/// Returns the number of lanes that participate in the RPC interface.
+LIBC_INLINE uint32_t get_num_lanes() {
+#ifdef RPC_TARGET_IS_GPU
+  return __gpu_num_lanes();
+#else
+  return 1;
+#endif
+}
+
+/// Returns a bitmask of the active lanes in the current warp or wavefront.
+LIBC_INLINE uint64_t get_lane_mask() {
+#ifdef RPC_TARGET_IS_GPU
+  return __gpu_lane_mask();
+#else
+  return 1;
+#endif
+}
+
+/// Returns the id of the thread inside of an AMD wavefront executing together.
+LIBC_INLINE uint32_t get_lane_id() {
+#ifdef RPC_TARGET_IS_GPU
+  return __gpu_lane_id();
+#else
+  return 0;
+#endif
+}
+
+/// Conditional that is only true for a single thread in a lane.
+LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
+#ifdef RPC_TARGET_IS_GPU
+  return __gpu_is_first_in_lane(lane_mask);
+#else
+  return true;
+#endif
+}
+
+/// Returns a bitmask of threads in the current lane for which \p x is true.
+LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
+#ifdef RPC_TARGET_IS_GPU
+  return __gpu_ballot(lane_mask, x);
+#else
+  return x;
+#endif
+}
+
 /// Return \p val aligned "upwards" according to \p align.
 template <typename V, typename A>
 LIBC_INLINE constexpr V align_up(V val, A align) {

From 7dc40479085791d4fa46f0b5b91a8d2073364bd0 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Fri, 15 Nov 2024 22:24:56 -0600
Subject: [PATCH 2/4] Put convergent back

---
 libc/src/__support/RPC/rpc.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index 1482230d73c7c1..d0f653c384e166 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -164,7 +164,8 @@ template <bool Invert> struct Process {
   /// lane_mask is a bitmap of the threads in the warp that would hold the
   /// single lock on success, e.g. the result of rpc::get_lane_mask()
   /// The lock is held when the n-th bit of the lock bitfield is set.
-  LIBC_INLINE bool try_lock(uint64_t lane_mask, uint32_t index) {
+  [[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask,
+                                                  uint32_t index) {
     // On amdgpu, test and set to the nth lock bit and a sync_lane would suffice
     // On volta, need to handle differences between the threads running and
     // the threads that were detected in the previous call to get_lane_mask()
@@ -204,7 +205,8 @@ template <bool Invert> struct Process {
 
   /// Unlock the lock at index. We need a lane sync to keep this function
   /// convergent, otherwise the compiler will sink the store and deadlock.
-  LIBC_INLINE void unlock(uint64_t lane_mask, uint32_t index) {
+  [[clang::convergent]] LIBC_INLINE void unlock(uint64_t lane_mask,
+                                                uint32_t index) {
     // Do not move any writes past the unlock.
     __atomic_thread_fence(__ATOMIC_RELEASE);
 
@@ -517,7 +519,8 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
 /// port. Each port instance uses an associated \p opcode to tell the server
 /// what to do. The Client interface provides the appropriate lane size to the
 /// port using the platform's returned value.
-template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
+template <uint16_t opcode>
+[[clang::convergent]] LIBC_INLINE Client::Port Client::open() {
   // Repeatedly perform a naive linear scan for a port that can be opened to
   // send data.
   for (uint32_t index = 0;; ++index) {
@@ -551,7 +554,7 @@ template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
 
 /// Attempts to open a port to use as the server. The server can only open a
 /// port if it has a pending receive operation
-LIBC_INLINE cpp::optional<typename Server::Port>
+[[clang::convergent]] LIBC_INLINE cpp::optional<typename Server::Port>
 Server::try_open(uint32_t lane_size, uint32_t start) {
   // Perform a naive linear scan for a port that has a pending request.
   for (uint32_t index = start; index < process.port_count; ++index) {

From 606d355f8f8d1bd3f3c1de3853e18b5e273352bf Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 18 Nov 2024 13:34:02 -0600
Subject: [PATCH 3/4] Remove header and fix transitive uses

---
 libc/src/__support/OSUtil/gpu/exit.cpp | 1 +
 libc/src/__support/RPC/rpc.h           | 1 -
 libc/src/stdio/gpu/vfprintf_utils.h    | 1 +
 libc/src/stdlib/gpu/abort.cpp          | 1 +
 4 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/libc/src/__support/OSUtil/gpu/exit.cpp b/libc/src/__support/OSUtil/gpu/exit.cpp
index 8aaa41b4e3eefc..0cb266a42d180a 100644
--- a/libc/src/__support/OSUtil/gpu/exit.cpp
+++ b/libc/src/__support/OSUtil/gpu/exit.cpp
@@ -8,6 +8,7 @@
 
 #include "src/__support/OSUtil/exit.h"
 
+#include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/architectures.h"
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index d0f653c384e166..9fb4c061f39201 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -20,7 +20,6 @@
 
 #include "rpc_util.h"
 #include "src/__support/CPP/optional.h"
-#include "src/__support/GPU/utils.h"
 #include "src/__support/macros/config.h"
 
 #include <stdint.h>
diff --git a/libc/src/stdio/gpu/vfprintf_utils.h b/libc/src/stdio/gpu/vfprintf_utils.h
index 5010ee16d96074..409775f3f33cc8 100644
--- a/libc/src/stdio/gpu/vfprintf_utils.h
+++ b/libc/src/stdio/gpu/vfprintf_utils.h
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "hdr/types/FILE.h"
+#include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "src/__support/arg_list.h"
 #include "src/__support/macros/config.h"
diff --git a/libc/src/stdlib/gpu/abort.cpp b/libc/src/stdlib/gpu/abort.cpp
index cfc7e9b8e228ba..3a06fb38c3f64f 100644
--- a/libc/src/stdlib/gpu/abort.cpp
+++ b/libc/src/stdlib/gpu/abort.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"

From a2a27c3d825958f335a40308b2b7aff4829b77ed Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 18 Nov 2024 15:29:34 -0600
Subject: [PATCH 4/4] [libc] Pull last dependencies into rpc_util.h

Summary:
This is the last piece needed to remove the dependencies on LLVM libc headers. It
pulls the `sleep_briefly`, `std::optional`, and `type_traits` definitions into the
`rpc_util.h` header. This duplicates some code for now, but the header will soon be
moved into the `include/rpc` directory. At that point I will replace all of the
`LIBC_INLINE` uses with `RPC_INLINE`. Internal use will then go through a wrapper
that keeps everything in the LIBC namespace, and implementations will provide
their own handling.
---
 libc/src/__support/RPC/rpc.h        |  17 ++--
 libc/src/__support/RPC/rpc_client.h |   5 +
 libc/src/__support/RPC/rpc_util.h   | 148 +++++++++++++++++++++++++++-
 3 files changed, 157 insertions(+), 13 deletions(-)

diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index 9fb4c061f39201..be7686b2a2fe59 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -19,7 +19,7 @@
 #define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_H
 
 #include "rpc_util.h"
-#include "src/__support/CPP/optional.h"
+#include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 
 #include <stdint.h>
@@ -303,7 +303,7 @@ template <bool T> struct Port {
 
   friend struct Client;
   friend struct Server;
-  friend class cpp::optional<Port<T>>;
+  friend class rpc::optional<Port<T>>;
 
 public:
   template <typename U> LIBC_INLINE void recv(U use);
@@ -359,9 +359,6 @@ struct Client {
 private:
   Process<false> process;
 };
-static_assert(cpp::is_trivially_copyable<Client>::value &&
-                  sizeof(Process<true>) == sizeof(Process<false>),
-              "The client is not trivially copyable from the server");
 
 /// The RPC server used to respond to the client.
 struct Server {
@@ -374,7 +371,7 @@ struct Server {
       : process(port_count, buffer) {}
 
   using Port = rpc::Port<true>;
-  LIBC_INLINE cpp::optional<Port> try_open(uint32_t lane_size,
+  LIBC_INLINE rpc::optional<Port> try_open(uint32_t lane_size,
                                            uint32_t start = 0);
   LIBC_INLINE Port open(uint32_t lane_size);
 
@@ -553,7 +550,7 @@ template <uint16_t opcode>
 
 /// Attempts to open a port to use as the server. The server can only open a
 /// port if it has a pending receive operation
-[[clang::convergent]] LIBC_INLINE cpp::optional<typename Server::Port>
+[[clang::convergent]] LIBC_INLINE rpc::optional<typename Server::Port>
 Server::try_open(uint32_t lane_size, uint32_t start) {
   // Perform a naive linear scan for a port that has a pending request.
   for (uint32_t index = start; index < process.port_count; ++index) {
@@ -580,13 +577,13 @@ Server::try_open(uint32_t lane_size, uint32_t start) {
 
     return Port(process, lane_mask, lane_size, index, out);
   }
-  return cpp::nullopt;
+  return rpc::nullopt;
 }
 
 LIBC_INLINE Server::Port Server::open(uint32_t lane_size) {
   for (;;) {
-    if (cpp::optional<Server::Port> p = try_open(lane_size))
-      return cpp::move(p.value());
+    if (rpc::optional<Server::Port> p = try_open(lane_size))
+      return rpc::move(p.value());
     sleep_briefly();
   }
 }
diff --git a/libc/src/__support/RPC/rpc_client.h b/libc/src/__support/RPC/rpc_client.h
index 695b6b7515bf7b..7bd6d0b5e00b47 100644
--- a/libc/src/__support/RPC/rpc_client.h
+++ b/libc/src/__support/RPC/rpc_client.h
@@ -12,11 +12,16 @@
 #include "rpc.h"
 
 #include "include/llvm-libc-types/rpc_opcodes_t.h"
+#include "src/__support/CPP/type_traits.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace rpc {
 
+static_assert(cpp::is_trivially_copyable<Client>::value &&
+                  sizeof(Process<true>) == sizeof(Process<false>),
+              "The client is not trivially copyable from the server");
+
 /// The libc client instance used to communicate with the server.
 extern Client client;
 
diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h
index 39e5f30b84ac44..aa23549d4b8c2e 100644
--- a/libc/src/__support/RPC/rpc_util.h
+++ b/libc/src/__support/RPC/rpc_util.h
@@ -9,9 +9,11 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_RPC_RPC_UTIL_H
 #define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_UTIL_H
 
-#include "src/__support/CPP/type_traits.h"
+#include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/threads/sleep.h"
+
+#include <stddef.h>
+#include <stdint.h>
 
 #if defined(__NVPTX__) || defined(__AMDGPU__)
 #include <gpuintrin.h>
@@ -21,6 +23,146 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace rpc {
 
+template <typename T> struct type_identity {
+  using type = T;
+};
+
+template <class T, T v> struct type_constant {
+  static inline constexpr T value = v;
+};
+
+template <class T> struct remove_reference : type_identity<T> {};
+template <class T> struct remove_reference<T &> : type_identity<T> {};
+template <class T> struct remove_reference<T &&> : type_identity<T> {};
+
+template <class T> struct is_const : type_constant<bool, false> {};
+template <class T> struct is_const<const T> : type_constant<bool, true> {};
+
+/// Freestanding implementation of std::move.
+template <class T>
+LIBC_INLINE constexpr typename remove_reference<T>::type &&move(T &&t) {
+  return static_cast<typename remove_reference<T>::type &&>(t);
+}
+
+/// Freestanding implementation of std::forward.
+template <typename T>
+LIBC_INLINE constexpr T &&forward(typename remove_reference<T>::type &value) {
+  return static_cast<T &&>(value);
+}
+template <typename T>
+LIBC_INLINE constexpr T &&forward(typename remove_reference<T>::type &&value) {
+  return static_cast<T &&>(value);
+}
+
+struct in_place_t {
+  LIBC_INLINE explicit in_place_t() = default;
+};
+
+struct nullopt_t {
+  LIBC_INLINE constexpr explicit nullopt_t() = default;
+};
+
+constexpr inline in_place_t in_place{};
+constexpr inline nullopt_t nullopt{};
+
+/// Freestanding and minimal implementation of std::optional.
+template <typename T> class optional {
+  template <typename U> struct OptionalStorage {
+    union {
+      char empty;
+      U stored_value;
+    };
+
+    bool in_use = false;
+
+    LIBC_INLINE ~OptionalStorage() { reset(); }
+
+    LIBC_INLINE constexpr OptionalStorage() : empty() {}
+
+    template <typename... Args>
+    LIBC_INLINE constexpr explicit OptionalStorage(in_place_t, Args &&...args)
+        : stored_value(forward<Args>(args)...) {}
+
+    LIBC_INLINE constexpr void reset() {
+      if (in_use)
+        stored_value.~U();
+      in_use = false;
+    }
+  };
+
+  OptionalStorage<T> storage;
+
+public:
+  LIBC_INLINE constexpr optional() = default;
+  LIBC_INLINE constexpr optional(nullopt_t) {}
+
+  LIBC_INLINE constexpr optional(const T &t) : storage(in_place, t) {
+    storage.in_use = true;
+  }
+  LIBC_INLINE constexpr optional(const optional &) = default;
+
+  LIBC_INLINE constexpr optional(T &&t) : storage(in_place, move(t)) {
+    storage.in_use = true;
+  }
+  LIBC_INLINE constexpr optional(optional &&O) = default;
+
+  LIBC_INLINE constexpr optional &operator=(T &&t) {
+    storage = move(t);
+    return *this;
+  }
+  LIBC_INLINE constexpr optional &operator=(optional &&) = default;
+
+  LIBC_INLINE constexpr optional &operator=(const T &t) {
+    storage = t;
+    return *this;
+  }
+  LIBC_INLINE constexpr optional &operator=(const optional &) = default;
+
+  LIBC_INLINE constexpr void reset() { storage.reset(); }
+
+  LIBC_INLINE constexpr const T &value() const & {
+    return storage.stored_value;
+  }
+
+  LIBC_INLINE constexpr T &value() & { return storage.stored_value; }
+
+  LIBC_INLINE constexpr explicit operator bool() const {
+    return storage.in_use;
+  }
+  LIBC_INLINE constexpr bool has_value() const { return storage.in_use; }
+  LIBC_INLINE constexpr const T *operator->() const {
+    return &storage.stored_value;
+  }
+  LIBC_INLINE constexpr T *operator->() { return &storage.stored_value; }
+  LIBC_INLINE constexpr const T &operator*() const & {
+    return storage.stored_value;
+  }
+  LIBC_INLINE constexpr T &operator*() & { return storage.stored_value; }
+
+  LIBC_INLINE constexpr T &&value() && { return move(storage.stored_value); }
+  LIBC_INLINE constexpr T &&operator*() && {
+    return move(storage.stored_value);
+  }
+};
+
+/// Suspend the thread briefly to assist the thread scheduler during busy loops.
+LIBC_INLINE void sleep_briefly() {
+#if defined(LIBC_TARGET_ARCH_IS_NVPTX)
+  if (__nvvm_reflect("__CUDA_ARCH") >= 700)
+    LIBC_INLINE_ASM("nanosleep.u32 64;" ::: "memory");
+#elif defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+  __builtin_amdgcn_s_sleep(2);
+#elif defined(LIBC_TARGET_ARCH_IS_X86)
+  __builtin_ia32_pause();
+#elif defined(LIBC_TARGET_ARCH_IS_AARCH64) && __has_builtin(__builtin_arm_isb)
+  __builtin_arm_isb(0xf);
+#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
+  asm volatile("isb\n" ::: "memory");
+#else
+  // Simply do nothing if sleeping isn't supported on this platform.
+#endif
+}
+
 /// Conditional to indicate if this process is running on the GPU.
 LIBC_INLINE constexpr bool is_process_gpu() {
 #ifdef RPC_TARGET_IS_GPU
@@ -109,7 +251,7 @@ template <typename V> LIBC_INLINE V &lane_value(V *val, uint32_t id) {
 
 /// Advance the \p p by \p bytes.
 template <typename T, typename U> LIBC_INLINE T *advance(T *ptr, U bytes) {
-  if constexpr (cpp::is_const_v<T>)
+  if constexpr (is_const<T>::value)
     return reinterpret_cast<T *>(reinterpret_cast<const uint8_t *>(ptr) +
                                  bytes);
   else


