[libc-commits] [libc] [libc] Add memory fence utility to the GPU utilities (PR #91756)

Fri May 10 08:19:48 PDT 2024

llvmbot wrote:



@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-libc

Author: Joseph Huber (jhuber6)

<details>
<summary>Changes</summary>

Summary:
GPUs like to execute instructions in the background until something
explicitly consumes them. We are working on adding some
microbenchmarking code, which requires flushing the pending memory
operations beforehand. This patch simply adds these utility functions
that will be used in the near future.


---
Full diff: https://github.com/llvm/llvm-project/pull/91756.diff


2 Files Affected:

- (modified) libc/src/__support/GPU/amdgpu/utils.h (+6) 
- (modified) libc/src/__support/GPU/nvptx/utils.h (+6) 


``````````diff

diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
index 9b520a6bcf38d..ff30d06f186b5 100644
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -140,6 +140,12 @@ LIBC_INLINE uint32_t get_lane_size() {
   __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
 }
 
+/// Waits for all pending memory operations to complete in program order.
+[[clang::convergent]] LIBC_INLINE void memory_fence() {
+  __builtin_amdgcn_s_waitcnt(0);
+  __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "");
+}
+
 /// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
 [[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) {
   __builtin_amdgcn_wave_barrier();
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
index 3f19afb836486..1e94e8af369df 100644
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -118,14 +118,20 @@ LIBC_INLINE uint32_t get_lane_size() { return 32; }
   uint32_t mask = static_cast<uint32_t>(lane_mask);
   return __nvvm_vote_ballot_sync(mask, x);
 }
+
 /// Waits for all the threads in the block to converge and issues a fence.
 [[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }
 
+/// Waits for all pending memory operations to complete in program order.
+[[clang::convergent]] LIBC_INLINE void memory_fence() { __nvvm_membar_sys(); }
+
 /// Waits for all threads in the warp to reconverge for independent scheduling.
 [[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {
   __nvvm_bar_warp_sync(static_cast<uint32_t>(mask));
 }
 
+///:
+
 /// Shuffles the the lanes inside the warp according to the given index.
 [[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t lane_mask,
                                                    uint32_t idx, uint32_t x) {

``````````

</details>


https://github.com/llvm/llvm-project/pull/91756