[libc-commits] [libc] 979fb95 - Revert "[libc] Treat the locks array as a bitfield"

Joseph Huber via libc-commits libc-commits at lists.llvm.org
Wed Jul 19 07:27:21 PDT 2023


Author: Joseph Huber
Date: 2023-07-19T09:27:08-05:00
New Revision: 979fb95021c81024c7684e0572b6899cf5a28445

URL: https://github.com/llvm/llvm-project/commit/979fb95021c81024c7684e0572b6899cf5a28445
DIFF: https://github.com/llvm/llvm-project/commit/979fb95021c81024c7684e0572b6899cf5a28445.diff

LOG: Revert "[libc] Treat the locks array as a bitfield"

Summary:
This caused test failures on the gfx90a buildbot. The change works on my
gfx1030 and on the Nvidia buildbots, so we'll need to investigate what is
going wrong here. For now, revert it to get the bots green.

This reverts commit 05abcc579244b68162b847a6780d27b22bd58f74.

Added: 
    

Modified: 
    libc/src/__support/RPC/rpc.h
    libc/startup/gpu/amdgpu/start.cpp
    libc/startup/gpu/nvptx/start.cpp
    libc/utils/gpu/server/rpc_server.cpp
    libc/utils/gpu/server/rpc_server.h

Removed: 
    


################################################################################
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index a252002f2bcc54..d91a5c8b8dba14 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -56,8 +56,11 @@ template <uint32_t lane_size = gpu::LANE_SIZE> struct alignas(64) Packet {
   Payload<lane_size> payload;
 };
 
-/// The maximum number of parallel ports that the RPC interface can support.
-constexpr uint64_t MAX_PORT_COUNT = 512;
+// TODO: This should be configured by the server and passed in. The general rule
+//       of thumb is that you should have at least as many ports as there are
+//       possible concurrent work items on the GPU to mitigate the lack of
+//       forward progress guarantees on the GPU.
+constexpr uint64_t DEFAULT_PORT_COUNT = 64;
 
 /// A common process used to synchronize communication between a client and a
 /// server. The process contains a read-only inbox and a write-only outbox used
@@ -84,8 +87,7 @@ template <bool Invert, typename Packet> struct Process {
   cpp::Atomic<uint32_t> *outbox;
   Packet *packet;
 
-  static constexpr uint64_t NUM_BITS_IN_WORD = (sizeof(uint32_t) * 8);
-  cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
+  cpp::Atomic<uint32_t> lock[DEFAULT_PORT_COUNT] = {0};
 
   /// Initialize the communication channels.
   LIBC_INLINE void reset(uint64_t port_count, void *buffer) {
@@ -156,9 +158,10 @@ template <bool Invert, typename Packet> struct Process {
 
   /// Attempt to claim the lock at index. Return true on lock taken.
   /// lane_mask is a bitmap of the threads in the warp that would hold the
-  /// single lock on success, e.g. the result of gpu::get_lane_mask().
-  /// The lock is held when the nth bit of the lock bitfield is set, otherwise
-  /// it is availible.
+  /// single lock on success, e.g. the result of gpu::get_lane_mask().
+  /// The lock is held when the zeroth bit of the uint32_t at lock[index]
+  /// is set, and available when that bit is clear. Bits [1, 32) are zero.
+  /// OR'ing with one is a no-op when the lock is already held.
   [[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask,
                                                   uint64_t index) {
     // On amdgpu, test and set to lock[index] and a sync_lane would suffice
@@ -170,10 +173,11 @@ template <bool Invert, typename Packet> struct Process {
     // succeed in taking the lock, as otherwise it will leak. This is handled
    // by making threads which are not in lane_mask fetch_or with 0, a no-op.
     uint32_t id = gpu::get_lane_id();
-    bool id_in_lane_mask = lane_mask & (1u << id);
+    bool id_in_lane_mask = lane_mask & (1ul << id);
 
     // All threads in the warp call fetch_or. Possibly at the same time.
-    bool before = set_nth(lock, index, id_in_lane_mask);
+    bool before =
+        lock[index].fetch_or(id_in_lane_mask, cpp::MemoryOrder::RELAXED);
     uint64_t packed = gpu::ballot(lane_mask, before);
 
     // If every bit set in lane_mask is also set in packed, every single thread
@@ -208,11 +212,12 @@ template <bool Invert, typename Packet> struct Process {
     // Wait for other threads in the warp to finish using the lock
     gpu::sync_lane(lane_mask);
 
-    // Use exactly one thread to clear the associated lock bit. Must restrict
-    // to a single thread to avoid one thread dropping the lock, then an
-    // unrelated warp claiming the lock, then a second thread in this warp
-    // dropping the lock again.
-    clear_nth(lock, index, rpc::is_first_lane(lane_mask));
+    // Use exactly one thread to clear the bit at position 0 in lock[index].
+    // Must restrict to a single thread to avoid one thread dropping the lock,
+    // then an unrelated warp claiming the lock, then a second thread in this
+    // warp dropping the lock again.
+    uint32_t and_mask = ~(rpc::is_first_lane(lane_mask) ? 1 : 0);
+    lock[index].fetch_and(and_mask, cpp::MemoryOrder::RELAXED);
     gpu::sync_lane(lane_mask);
   }
 
@@ -240,26 +245,6 @@ template <bool Invert, typename Packet> struct Process {
   LIBC_INLINE static constexpr uint64_t buffer_offset(uint64_t port_count) {
     return align_up(2 * mailbox_bytes(port_count), alignof(Packet));
   }
-
-  /// Conditionally set the n-th bit in the atomic bitfield.
-  LIBC_INLINE static constexpr uint32_t set_nth(cpp::Atomic<uint32_t> *bits,
-                                                uint64_t index, bool cond) {
-    const uint32_t slot = index / NUM_BITS_IN_WORD;
-    const uint32_t bit = index % NUM_BITS_IN_WORD;
-    return bits[slot].fetch_or(static_cast<uint32_t>(cond) << bit,
-                               cpp::MemoryOrder::RELAXED) &
-           (1u << bit);
-  }
-
-  /// Conditionally clear the n-th bit in the atomic bitfield.
-  LIBC_INLINE static constexpr uint32_t clear_nth(cpp::Atomic<uint32_t> *bits,
-                                                  uint64_t index, bool cond) {
-    const uint32_t slot = index / NUM_BITS_IN_WORD;
-    const uint32_t bit = index % NUM_BITS_IN_WORD;
-    return bits[slot].fetch_and(-1u ^ (static_cast<uint32_t>(cond) << bit),
-                                cpp::MemoryOrder::RELAXED) &
-           (1u << bit);
-  }
 };
 
 /// Invokes a function across every active buffer across the total lane size.

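For context on what this undoes: the bit-packed scheme stored one lock bit
per port, so 512 ports fit in MAX_PORT_COUNT / 32 = 16 uint32_t words, while
the restored code dedicates a full uint32_t per port and only ever touches
bit 0. Below is a minimal host-side sketch of the packed variant, with
std::atomic standing in for LLVM libc's cpp::Atomic (an assumption made so
the sketch compiles standalone), showing the slot/bit arithmetic that
set_nth and clear_nth performed:

  #include <atomic>
  #include <cassert>
  #include <cstdint>

  constexpr uint64_t MAX_PORT_COUNT = 512;
  constexpr uint64_t NUM_BITS_IN_WORD = sizeof(uint32_t) * 8;

  // One bit per port: 512 locks packed into 16 words.
  std::atomic<uint32_t> locks[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {};

  // Conditionally set the n-th bit; returns true if it was already set.
  bool set_nth(uint64_t index, bool cond) {
    const uint32_t slot = index / NUM_BITS_IN_WORD;
    const uint32_t bit = index % NUM_BITS_IN_WORD;
    return locks[slot].fetch_or(static_cast<uint32_t>(cond) << bit,
                                std::memory_order_relaxed) &
           (1u << bit);
  }

  // Conditionally clear the n-th bit; returns true if it was set before.
  bool clear_nth(uint64_t index, bool cond) {
    const uint32_t slot = index / NUM_BITS_IN_WORD;
    const uint32_t bit = index % NUM_BITS_IN_WORD;
    return locks[slot].fetch_and(~(static_cast<uint32_t>(cond) << bit),
                                 std::memory_order_relaxed) &
           (1u << bit);
  }

  int main() {
    assert(!set_nth(37, true));  // lock 37 was free; it is now taken
    assert(set_nth(37, true));   // already held: OR'ing with 1 is a no-op
    assert(clear_nth(37, true)); // drop the lock
    assert(!set_nth(37, false)); // cond == false never modifies the word
  }

The design trade-off being reverted: packing shrinks the lock array from 512
words to 16, at the cost of up to 32 ports sharing a single atomic word.
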
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index b2adb1d3abcaf6..7081ae6d7d0164 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -47,7 +47,7 @@ extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
 _begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
   // We need to set up the RPC client first in case any of the constructors
   // require it.
-  __llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
+  __llvm_libc::rpc::client.reset(__llvm_libc::rpc::DEFAULT_PORT_COUNT,
                                  rpc_shared_buffer);
 
   // We want the fini array callbacks to be run after other atexit

diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index cd442394e74ce8..90f00496a6b2d8 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -45,7 +45,7 @@ extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
 _begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
   // We need to set up the RPC client first in case any of the constructors
   // require it.
-  __llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
+  __llvm_libc::rpc::client.reset(__llvm_libc::rpc::DEFAULT_PORT_COUNT,
                                  rpc_shared_buffer);
 
   // We want the fini array callbacks to be run after other atexit

diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
index 52c202e193d4fc..da9f50603f1181 100644
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -23,7 +23,7 @@ using namespace __llvm_libc;
 static_assert(sizeof(rpc_buffer_t) == sizeof(rpc::Buffer),
               "Buffer size mismatch");
 
-static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::MAX_PORT_COUNT,
+static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::DEFAULT_PORT_COUNT,
               "Incorrect maximum port count");
 
 // The client needs to support different lane sizes for the SIMT model. Because

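The static_assert above is what ties the server's C interface to the library
constant: RPC_MAXIMUM_PORT_COUNT in rpc_server.h must track
rpc::DEFAULT_PORT_COUNT exactly, which is why both files change together in
this revert. A reduced, self-contained sketch of that guard pattern (both
constants are inlined here purely for illustration; in tree they live in the
two headers):

  #include <cstdint>

  namespace rpc {
  // Stands in for rpc::DEFAULT_PORT_COUNT from libc/src/__support/RPC/rpc.h.
  constexpr uint64_t DEFAULT_PORT_COUNT = 64;
  } // namespace rpc

  // Stands in for RPC_MAXIMUM_PORT_COUNT from rpc_server.h.
  const uint64_t RPC_MAXIMUM_PORT_COUNT = 64;

  // Compilation fails the moment the two constants diverge.
  static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::DEFAULT_PORT_COUNT,
                "Incorrect maximum port count");
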
diff --git a/libc/utils/gpu/server/rpc_server.h b/libc/utils/gpu/server/rpc_server.h
index 3c9ba1db668492..556f4e9ee84f95 100644
--- a/libc/utils/gpu/server/rpc_server.h
+++ b/libc/utils/gpu/server/rpc_server.h
@@ -18,7 +18,7 @@ extern "C" {
 #endif
 
 /// The maximum number of ports that can be opened for any server.
-const uint64_t RPC_MAXIMUM_PORT_COUNT = 512;
+const uint64_t RPC_MAXIMUM_PORT_COUNT = 64;
 
 /// The symbol name associated with the client for use with the LLVM C library
 /// implementation.

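One last illustration of the restored behavior: the unlock path in the rpc.h
hunk above releases the lock with a fetch_and whose mask depends on whether
the calling lane is the first active lane. A minimal host-side sketch, again
with std::atomic standing in for cpp::Atomic and the lane predicate reduced
to a plain bool:

  #include <atomic>
  #include <cassert>
  #include <cstdint>

  // Only the first lane's mask is ~1u, which clears bit 0; every other
  // lane ANDs with ~0u, leaving the word untouched.
  void release(std::atomic<uint32_t> &word, bool is_first_lane) {
    uint32_t and_mask = ~(is_first_lane ? 1u : 0u);
    word.fetch_and(and_mask, std::memory_order_relaxed);
  }

  int main() {
    std::atomic<uint32_t> word{1}; // bit 0 set: the lock is held
    release(word, /*is_first_lane=*/false);
    assert(word.load() == 1u); // non-first lanes are a no-op
    release(word, /*is_first_lane=*/true);
    assert(word.load() == 0u); // the first lane drops the lock
  }

Funneling the clear through a single lane matches the comment in the hunk:
if more than one lane cleared the bit, an unrelated warp could claim the
lock between the first clear and the second, and the second clear would then
release a lock it no longer owns.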

More information about the libc-commits mailing list