[libc-commits] [libc] [libc] Partially implement 'rand' for the GPU (PR #66167)

Joseph Huber via libc-commits libc-commits at lists.llvm.org
Tue Sep 12 20:10:52 PDT 2023


https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/66167:

Summary:
This patch partially implements the `rand` function on the GPU. This is
partial because the GPU currently doesn't support thread local storage
or static initializers. To implement this on the GPU. I use 1/8th of the
local / shared memory quota to treak the shared memory as thread local
storage. This is done by simply allocating enough storage for each
thread in the block and indexing into this based off of the thread id.
The downside to this is that it does not initialize `srand` correctly to
be `1` as the standard says, it is also wasteful. In the future we
should figure out a way to support TLS on the GPU so that this can be
completely common and less resource intensive.


>From 42dbe20b74f10d8a9db569d02f1fb8bd48a4e295 Mon Sep 17 00:00:00 2001
From: Joseph Huber <jhuber6 at vols.utk.edu>
Date: Tue, 12 Sep 2023 18:13:22 -0500
Subject: [PATCH] [libc] Partially implement 'rand' for the GPU

Summary:
This patch partially implements the `rand` function on the GPU. This is
partial because the GPU currently doesn't support thread local storage
or static initializers. To implement this on the GPU. I use 1/8th of the
local / shared memory quota to treak the shared memory as thread local
storage. This is done by simply allocating enough storage for each
thread in the block and indexing into this based off of the thread id.
The downside to this is that it does not initialize `srand` correctly to
be `1` as the standard says, it is also wasteful. In the future we
should figure out a way to support TLS on the GPU so that this can be
completely common and less resource intensive.
---
 libc/config/gpu/entrypoints.txt       |  2 ++
 libc/src/__support/GPU/amdgpu/utils.h | 10 ++++++----
 libc/src/stdlib/rand.cpp              | 10 ++++++----
 libc/src/stdlib/rand_util.cpp         |  6 ++++++
 libc/src/stdlib/rand_util.h           | 22 ++++++++++++++++++++++
 libc/test/src/stdlib/rand_test.cpp    |  3 +++
 6 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index 0e314c60870c6ae..532c57d27718eb3 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -61,6 +61,8 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdlib.ldiv
     libc.src.stdlib.llabs
     libc.src.stdlib.lldiv
+    libc.src.stdlib.rand
+    libc.src.stdlib.srand
     libc.src.stdlib.strtod
     libc.src.stdlib.strtof
     libc.src.stdlib.strtol
diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
index f2fa84bc4259fb6..c23c9a2e2d985f8 100644
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -21,10 +21,12 @@ namespace gpu {
 constexpr const uint64_t LANE_SIZE = __AMDGCN_WAVEFRONT_SIZE;
 
 /// Type aliases to the address spaces used by the AMDGPU backend.
-template <typename T> using Private = [[clang::opencl_private]] T;
-template <typename T> using Constant = [[clang::opencl_constant]] T;
-template <typename T> using Local = [[clang::opencl_local]] T;
-template <typename T> using Global = [[clang::opencl_global]] T;
+// FIXME: The OpenCL attributes are currently buggy on AMDGPU compared to the
+// direct address space attributes.
+template <typename T> using Private = [[clang::address_space(5)]] T;
+template <typename T> using Constant = [[clang::address_space(4)]] T;
+template <typename T> using Local = [[clang::address_space(3)]] T;
+template <typename T> using Global = [[clang::address_space(1)]] T;
 
 /// Returns the number of workgroups in the 'x' dimension of the grid.
 LIBC_INLINE uint32_t get_num_blocks_x() {
diff --git a/libc/src/stdlib/rand.cpp b/libc/src/stdlib/rand.cpp
index 771944f8b336fb7..038c0e082409e12 100644
--- a/libc/src/stdlib/rand.cpp
+++ b/libc/src/stdlib/rand.cpp
@@ -15,10 +15,12 @@ namespace __llvm_libc {
 // An implementation of the xorshift64star pseudo random number generator. This
 // is a good general purpose generator for most non-cryptographics applications.
 LLVM_LIBC_FUNCTION(int, rand, (void)) {
-  rand_next ^= rand_next >> 12;
-  rand_next ^= rand_next << 25;
-  rand_next ^= rand_next >> 27;
-  return static_cast<int>((rand_next * 0x2545F4914F6CDD1Dul) >> 32) & RAND_MAX;
+  unsigned long x = rand_next;
+  x ^= x >> 12;
+  x ^= x << 25;
+  x ^= x >> 27;
+  rand_next = x;
+  return static_cast<int>((x * 0x2545F4914F6CDD1Dul) >> 32) & RAND_MAX;
 }
 
 } // namespace __llvm_libc
diff --git a/libc/src/stdlib/rand_util.cpp b/libc/src/stdlib/rand_util.cpp
index dac8dca2804e1c2..6e7ec85501af785 100644
--- a/libc/src/stdlib/rand_util.cpp
+++ b/libc/src/stdlib/rand_util.cpp
@@ -11,8 +11,14 @@
 
 namespace __llvm_libc {
 
+#ifdef LIBC_TARGET_ARCH_IS_GPU
+// FIXME: Local GPU memory cannot be initialized so we cannot currently provide
+// a standard compliant default value.
+ThreadLocal<unsigned long> rand_next;
+#else
 // C standard 7.10p2: If 'rand' is called before 'srand' it is to proceed as if
 // the 'srand' function was called with a value of '1'.
 LIBC_THREAD_LOCAL unsigned long rand_next = 1;
+#endif
 
 } // namespace __llvm_libc
diff --git a/libc/src/stdlib/rand_util.h b/libc/src/stdlib/rand_util.h
index 4cbfa1478ef7bb2..281ce8a9a65a6fa 100644
--- a/libc/src/stdlib/rand_util.h
+++ b/libc/src/stdlib/rand_util.h
@@ -9,11 +9,33 @@
 #ifndef LLVM_LIBC_SRC_STDLIB_RAND_UTIL_H
 #define LLVM_LIBC_SRC_STDLIB_RAND_UTIL_H
 
+#include "src/__support/GPU/utils.h"
 #include "src/__support/macros/attributes.h"
 
 namespace __llvm_libc {
 
+#ifdef LIBC_TARGET_ARCH_IS_GPU
+// Implement thread local storage on the GPU using local memory. Each thread
+// gets its slot in the local memory array and is private to the group.
+// TODO: We need to implement the 'thread_local' keyword on the GPU. This is an
+// inefficient and incomplete stand-in until that is done.
+template <typename T> class ThreadLocal {
+private:
+  static constexpr long MAX_THREADS = 1024;
+  [[clang::loader_uninitialized]] static inline gpu::Local<T>
+      storage[MAX_THREADS];
+
+public:
+  LIBC_INLINE operator T() const { return storage[gpu::get_thread_id()]; }
+  LIBC_INLINE void operator=(const T &value) {
+    storage[gpu::get_thread_id()] = value;
+  }
+};
+
+extern ThreadLocal<unsigned long> rand_next;
+#else
 extern LIBC_THREAD_LOCAL unsigned long rand_next;
+#endif
 
 } // namespace __llvm_libc
 
diff --git a/libc/test/src/stdlib/rand_test.cpp b/libc/test/src/stdlib/rand_test.cpp
index 4bebbe37ffbe114..771b0e0f5975911 100644
--- a/libc/test/src/stdlib/rand_test.cpp
+++ b/libc/test/src/stdlib/rand_test.cpp
@@ -23,12 +23,15 @@ TEST(LlvmLibcRandTest, UnsetSeed) {
     vals[i] = val;
   }
 
+  // FIXME: The GPU implementation cannot initialize the seed correctly.
+#ifndef LIBC_TARGET_ARCH_IS_GPU
   // The C standard specifies that if 'srand' is never called it should behave
   // as if 'srand' was called with a value of 1. If we seed the value with 1 we
   // should get the same sequence as the unseeded version.
   __llvm_libc::srand(1);
   for (size_t i = 0; i < 1000; ++i)
     ASSERT_EQ(__llvm_libc::rand(), vals[i]);
+#endif
 }
 
 TEST(LlvmLibcRandTest, SetSeed) {



More information about the libc-commits mailing list