[libc-commits] [libc] 1143da2 - [libc][gpu] Thread divergence fix on volta

Jon Chesterfield via libc-commits libc-commits at lists.llvm.org
Thu Aug 31 06:34:12 PDT 2023


Author: Jon Chesterfield
Date: 2023-08-31T14:34:02+01:00
New Revision: 1143da22450a1f8de510ef75613648fb614faee7

URL: https://github.com/llvm/llvm-project/commit/1143da22450a1f8de510ef75613648fb614faee7
DIFF: https://github.com/llvm/llvm-project/commit/1143da22450a1f8de510ef75613648fb614faee7.diff

LOG: [libc][gpu] Thread divergence fix on volta

The inbox/outbox loads are performed by all active lanes of the current warp, not by a single thread.

The outbox load indicates whether a port has been successfully opened. If some
lanes in the warp think the open succeeded while others think it failed, because
the warp happened to be diverged when the load occurred, all subsequent control
flow will be incorrect.
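
As a sketch of the shape of the fix (mirroring the rpc.h hunk below, not new
code), the outbox load now routes through gpu::broadcast_value so every lane in
the mask agrees on the result:

  // Before: each lane performs its own relaxed load, so a diverged warp can
  // disagree on whether the port open succeeded.
  LIBC_INLINE uint32_t load_outbox(uint32_t index) {
    return outbox[index].load(cpp::MemoryOrder::RELAXED);
  }

  // After: broadcast the first active lane's result across the warp so the
  // subsequent control flow is uniform.
  LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) {
    return gpu::broadcast_value(lane_mask,
                                outbox[index].load(cpp::MemoryOrder::RELAXED));
  }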

The inbox load indicates whether the machine on the other side of the RPC channel
has progressed. If lanes in the warp have different ideas about that, some will
try to progress their state transition while others won't. As far as the RPC layer
is concerned this is a performance problem rather than a correctness one - no lane
can start the transition early, only miss it and start late - but in practice the
calls layered on top of RPC do not have the interface required to detect this event
and retry the load on the stalled lanes, so those calls will be broken.
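
The consumer of the inbox load is wait_for_ownership, which spins until the
inbox flips; with the broadcast inside load_inbox the whole warp now observes
the flip together and exits the loop in lockstep (again mirroring the rpc.h
hunk below):

  LIBC_INLINE void wait_for_ownership(uint64_t lane_mask, uint32_t index,
                                      uint32_t outbox, uint32_t in) {
    while (buffer_unavailable(in, outbox)) {
      sleep_briefly();
      // Broadcast-backed load: no lane is left spinning on a stale value.
      in = load_inbox(lane_mask, index);
    }
    atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
  }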

None of this is broken on amdgpu, but the readfirstlane is likely to have
beneficial performance properties there, possibly significant enough that it's
worth landing this ahead of fixing gpu::broadcast_value on volta.
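
For reference, the amdgpu broadcast_value already funnels the value through
readfirstlane, which yields a wavefront-uniform result; the new lane mask
parameter is accepted but unused there (see the utils.h hunk below):

  /// Copies the value from the first active thread in the wavefront to the rest.
  [[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t,
                                                             uint32_t x) {
    return __builtin_amdgcn_readfirstlane(x);
  }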

Essentially, volta wasn't adequately considered when writing this part of the
protocol. The bug has been present since the initial prototype and propagated
this far because none of the test cases push volta into a warp-diverged state
in the middle of the RPC sequence.

We should have some test cases for volta where port_open and its equivalents
are called from diverged warps.
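
A rough sketch of what such a device-side test might look like - the opcode,
client handle, and payload are placeholders, not existing test code - is to
branch on the lane id so the port is opened under a partial lane mask:

  // Hypothetical test body: force Volta into a diverged state before opening.
  uint32_t id = gpu::get_lane_id();
  if (id % 2 == 0) {
    // Even lanes open a port while the odd lanes are inactive, so the
    // inbox/outbox loads inside open() run under a diverged mask.
    auto port = client.open<TEST_OPCODE>();
    port.send([](rpc::Buffer *buffer) { buffer->data[0] = 1; });
    port.close();
  } else {
    // Odd lanes do the same on the other side of the branch.
    auto port = client.open<TEST_OPCODE>();
    port.send([](rpc::Buffer *buffer) { buffer->data[0] = 2; });
    port.close();
  }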

Reviewed By: jhuber6

Differential Revision: https://reviews.llvm.org/D159276

Added: 
    

Modified: 
    libc/src/__support/GPU/amdgpu/utils.h
    libc/src/__support/GPU/generic/utils.h
    libc/src/__support/GPU/nvptx/utils.h
    libc/src/__support/RPC/rpc.h
    libc/test/src/__support/RPC/rpc_smoke_test.cpp

Removed: 
    


################################################################################
diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
index b8e193a793c6c3..532a47b4a99058 100644
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -125,7 +125,8 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
 }
 
 /// Copies the value from the first active thread in the wavefront to the rest.
-[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
+[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t,
+                                                           uint32_t x) {
   return __builtin_amdgcn_readfirstlane(x);
 }
 

diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h
index ba985afa696ee7..5c7913176b7ce7 100644
--- a/libc/src/__support/GPU/generic/utils.h
+++ b/libc/src/__support/GPU/generic/utils.h
@@ -61,12 +61,9 @@ LIBC_INLINE uint32_t get_lane_id() { return 0; }
 
 LIBC_INLINE uint64_t get_lane_mask() { return 1; }
 
-LIBC_INLINE uint32_t broadcast_value(uint32_t x) { return x; }
+LIBC_INLINE uint32_t broadcast_value(uint64_t, uint32_t x) { return x; }
 
-LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
-  (void)lane_mask;
-  return x;
-}
+LIBC_INLINE uint64_t ballot(uint64_t, bool x) { return x; }
 
 LIBC_INLINE void sync_threads() {}
 

diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
index 20b247e98605ae..5c6cf3a3a8e12c 100644
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -111,14 +111,12 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
 }
 
 /// Copies the value from the first active thread in the warp to the rest.
-[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint32_t x) {
-  // NOTE: This is not sufficient in all cases on Volta hardware or later. The
-  // lane mask returned here is not always the true lane mask used by the
-  // intrinsics in cases of incedental or enforced divergence by the user.
-  uint32_t lane_mask = static_cast<uint32_t>(get_lane_mask());
-  uint32_t id = __builtin_ffs(lane_mask) - 1;
+[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask,
+                                                           uint32_t x) {
+  uint32_t mask = static_cast<uint32_t>(lane_mask);
+  uint32_t id = __builtin_ffs(mask) - 1;
 #if __CUDA_ARCH__ >= 600
-  return __nvvm_shfl_sync_idx_i32(lane_mask, x, id, get_lane_size() - 1);
+  return __nvvm_shfl_sync_idx_i32(mask, x, id, get_lane_size() - 1);
 #else
   return __nvvm_shfl_idx_i32(x, id, get_lane_size() - 1);
 #endif
@@ -126,10 +124,11 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
 
 /// Returns a bitmask of threads in the current lane for which \p x is true.
 [[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
+  uint32_t mask = static_cast<uint32_t>(lane_mask);
 #if __CUDA_ARCH__ >= 600
-  return __nvvm_vote_ballot_sync(static_cast<uint32_t>(lane_mask), x);
+  return __nvvm_vote_ballot_sync(mask, x);
 #else
-  return static_cast<uint32_t>(lane_mask) & __nvvm_vote_ballot(x);
+  return mask & __nvvm_vote_ballot(x);
 #endif
 }
 /// Waits for all the threads in the block to converge and issues a fence.

diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index 519faa0d00ecd7..49336fbc0332d3 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -116,13 +116,15 @@ template <bool Invert, typename Packet> struct Process {
   }
 
   /// Retrieve the inbox state from memory shared between processes.
-  LIBC_INLINE uint32_t load_inbox(uint32_t index) {
-    return inbox[index].load(cpp::MemoryOrder::RELAXED);
+  LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) {
+    return gpu::broadcast_value(lane_mask,
+                                inbox[index].load(cpp::MemoryOrder::RELAXED));
   }
 
   /// Retrieve the outbox state from memory shared between processes.
-  LIBC_INLINE uint32_t load_outbox(uint32_t index) {
-    return outbox[index].load(cpp::MemoryOrder::RELAXED);
+  LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) {
+    return gpu::broadcast_value(lane_mask,
+                                outbox[index].load(cpp::MemoryOrder::RELAXED));
   }
 
   /// Signal to the other process that this one is finished with the buffer.
@@ -138,11 +140,11 @@ template <bool Invert, typename Packet> struct Process {
 
   // Given the current outbox and inbox values, wait until the inbox changes
   // to indicate that this thread owns the buffer element.
-  LIBC_INLINE void wait_for_ownership(uint32_t index, uint32_t outbox,
-                                      uint32_t in) {
+  LIBC_INLINE void wait_for_ownership(uint64_t lane_mask, uint32_t index,
+                                      uint32_t outbox, uint32_t in) {
     while (buffer_unavailable(in, outbox)) {
       sleep_briefly();
-      in = load_inbox(index);
+      in = load_inbox(lane_mask, index);
     }
     atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
   }
@@ -393,10 +395,10 @@ template <uint32_t lane_size> struct Server {
 template <bool T, typename S>
 template <typename F>
 LIBC_INLINE void Port<T, S>::send(F fill) {
-  uint32_t in = owns_buffer ? out ^ T : process.load_inbox(index);
+  uint32_t in = owns_buffer ? out ^ T : process.load_inbox(lane_mask, index);
 
   // We need to wait until we own the buffer before sending.
-  process.wait_for_ownership(index, out, in);
+  process.wait_for_ownership(lane_mask, index, out, in);
 
   // Apply the \p fill function to initialize the buffer and release the memory.
   invoke_rpc(fill, process.packet[index]);
@@ -416,10 +418,10 @@ LIBC_INLINE void Port<T, S>::recv(U use) {
     owns_buffer = false;
   }
 
-  uint32_t in = owns_buffer ? out ^ T : process.load_inbox(index);
+  uint32_t in = owns_buffer ? out ^ T : process.load_inbox(lane_mask, index);
 
   // We need to wait until we own the buffer before receiving.
-  process.wait_for_ownership(index, out, in);
+  process.wait_for_ownership(lane_mask, index, out, in);
 
   // Apply the \p use function to read the memory out of the buffer.
   invoke_rpc(use, process.packet[index]);
@@ -534,8 +536,8 @@ template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
     if (!process.try_lock(lane_mask, index))
       continue;
 
-    uint32_t in = process.load_inbox(index);
-    uint32_t out = process.load_outbox(index);
+    uint32_t in = process.load_inbox(lane_mask, index);
+    uint32_t out = process.load_outbox(lane_mask, index);
 
     // Once we acquire the index we need to check if we are in a valid sending
     // state.
@@ -561,8 +563,9 @@ template <uint32_t lane_size>
     Server<lane_size>::try_open() {
   // Perform a naive linear scan for a port that has a pending request.
   for (uint32_t index = 0; index < process.port_count; ++index) {
-    uint32_t in = process.load_inbox(index);
-    uint32_t out = process.load_outbox(index);
+    uint64_t lane_mask = gpu::get_lane_mask();
+    uint32_t in = process.load_inbox(lane_mask, index);
+    uint32_t out = process.load_outbox(lane_mask, index);
 
     // The server is passive, if there is no work pending don't bother
     // opening a port.
@@ -570,12 +573,11 @@ template <uint32_t lane_size>
       continue;
 
     // Attempt to acquire the lock on this index.
-    uint64_t lane_mask = gpu::get_lane_mask();
     if (!process.try_lock(lane_mask, index))
       continue;
 
-    in = process.load_inbox(index);
-    out = process.load_outbox(index);
+    in = process.load_inbox(lane_mask, index);
+    out = process.load_outbox(lane_mask, index);
 
     if (process.buffer_unavailable(in, out)) {
       process.unlock(lane_mask, index);

diff --git a/libc/test/src/__support/RPC/rpc_smoke_test.cpp b/libc/test/src/__support/RPC/rpc_smoke_test.cpp
index 05a34c340ca614..587ca8eb111f23 100644
--- a/libc/test/src/__support/RPC/rpc_smoke_test.cpp
+++ b/libc/test/src/__support/RPC/rpc_smoke_test.cpp
@@ -49,36 +49,37 @@ TEST(LlvmLibcRPCSmoke, SanityCheck) {
   EXPECT_TRUE(ProcB.try_lock(lane_mask, index));
 
   // All zero to begin with
-  EXPECT_EQ(ProcA.load_inbox(index), 0u);
-  EXPECT_EQ(ProcB.load_inbox(index), 0u);
-  EXPECT_EQ(ProcA.load_outbox(index), 0u);
-  EXPECT_EQ(ProcB.load_outbox(index), 0u);
+  EXPECT_EQ(ProcA.load_inbox(lane_mask, index), 0u);
+  EXPECT_EQ(ProcB.load_inbox(lane_mask, index), 0u);
+  EXPECT_EQ(ProcA.load_outbox(lane_mask, index), 0u);
+  EXPECT_EQ(ProcB.load_outbox(lane_mask, index), 0u);
 
   // Available for ProcA and not for ProcB
-  EXPECT_FALSE(ProcA.buffer_unavailable(ProcA.load_inbox(index),
-                                        ProcA.load_outbox(index)));
-  EXPECT_TRUE(ProcB.buffer_unavailable(ProcB.load_inbox(index),
-                                       ProcB.load_outbox(index)));
+  EXPECT_FALSE(ProcA.buffer_unavailable(ProcA.load_inbox(lane_mask, index),
+                                        ProcA.load_outbox(lane_mask, index)));
+  EXPECT_TRUE(ProcB.buffer_unavailable(ProcB.load_inbox(lane_mask, index),
+                                       ProcB.load_outbox(lane_mask, index)));
 
   // ProcA write to outbox
-  uint32_t ProcAOutbox = ProcA.load_outbox(index);
+  uint32_t ProcAOutbox = ProcA.load_outbox(lane_mask, index);
   EXPECT_EQ(ProcAOutbox, 0u);
   ProcAOutbox = ProcA.invert_outbox(index, ProcAOutbox);
   EXPECT_EQ(ProcAOutbox, 1u);
 
   // No longer available for ProcA
-  EXPECT_TRUE(ProcA.buffer_unavailable(ProcA.load_inbox(index), ProcAOutbox));
+  EXPECT_TRUE(ProcA.buffer_unavailable(ProcA.load_inbox(lane_mask, index),
+                                       ProcAOutbox));
 
   // Outbox is still zero, hasn't been written to
-  EXPECT_EQ(ProcB.load_outbox(index), 0u);
+  EXPECT_EQ(ProcB.load_outbox(lane_mask, index), 0u);
 
   // Wait for ownership will terminate because load_inbox returns 1
-  EXPECT_EQ(ProcB.load_inbox(index), 1u);
-  ProcB.wait_for_ownership(index, 0u, 0u);
+  EXPECT_EQ(ProcB.load_inbox(lane_mask, index), 1u);
+  ProcB.wait_for_ownership(lane_mask, index, 0u, 0u);
 
   // and B now has the buffer available
-  EXPECT_FALSE(ProcB.buffer_unavailable(ProcB.load_inbox(index),
-                                        ProcB.load_outbox(index)));
+  EXPECT_FALSE(ProcB.buffer_unavailable(ProcB.load_inbox(lane_mask, index),
+                                        ProcB.load_outbox(lane_mask, index)));
 
   // Enough checks for one test, close the locks
   ProcA.unlock(lane_mask, index);

