[libc-commits] [libc] ac604b2 - [libc] Switch to using the generic `<gpuintrin.h>` implementations (#121810)

Tue Jan 7 11:08:43 PST 2025

Author: Joseph Huber
Date: 2025-01-07T13:08:39-06:00
New Revision: ac604b2fa6ff0344a555954069721c0db7b874f9

URL: https://github.com/llvm/llvm-project/commit/ac604b2fa6ff0344a555954069721c0db7b874f9
DIFF: https://github.com/llvm/llvm-project/commit/ac604b2fa6ff0344a555954069721c0db7b874f9.diff

LOG: [libc] Switch to using the generic `<gpuintrin.h>` implementations (#121810)

Summary:
This patch switches the GPU utility helpers to wrapping around the
gpuintrin.h ones with a C++ flavor.

Added: 
    

Modified: 
    libc/src/__support/GPU/CMakeLists.txt
    libc/src/__support/GPU/utils.h
    libc/src/time/gpu/clock.cpp
    libc/src/time/gpu/nanosleep.cpp

Removed: 
    libc/src/__support/GPU/amdgpu/CMakeLists.txt
    libc/src/__support/GPU/amdgpu/utils.h
    libc/src/__support/GPU/generic/CMakeLists.txt
    libc/src/__support/GPU/generic/utils.h
    libc/src/__support/GPU/nvptx/CMakeLists.txt
    libc/src/__support/GPU/nvptx/utils.h


################################################################################
diff  --git a/libc/src/__support/GPU/CMakeLists.txt b/libc/src/__support/GPU/CMakeLists.txt
index 28fd9a1ebcc97e..9b359f65cdb332 100644

--- a/libc/src/__support/GPU/CMakeLists.txt
+++ b/libc/src/__support/GPU/CMakeLists.txt
@@ -1,16 +1,12 @@
-if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
+# These utilities are GPU only.
+if(NOT LIBC_TARGET_OS_IS_GPU)
   return()
 endif()
 
-add_subdirectory(${LIBC_TARGET_ARCHITECTURE})
-set(target_gpu_utils libc.src.__support.GPU.${LIBC_TARGET_ARCHITECTURE}.${LIBC_TARGET_ARCHITECTURE}_utils)
-
 add_header_library(
   utils
   HDRS
     utils.h
-  DEPENDS
-    ${target_gpu_utils}
 )
 
 add_object_library(
@@ -21,6 +17,6 @@ add_object_library(
     allocator.h
   DEPENDS
     libc.src.__support.common
-    libc.src.__support.GPU.utils
     libc.src.__support.RPC.rpc_client
+    .utils
 )

diff  --git a/libc/src/__support/GPU/amdgpu/CMakeLists.txt b/libc/src/__support/GPU/amdgpu/CMakeLists.txt
deleted file mode 100644
index f2b98fc03b218d..00000000000000
--- a/libc/src/__support/GPU/amdgpu/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-add_header_library(
-  amdgpu_utils
-  HDRS
-    utils.h
-  DEPENDS
-    libc.src.__support.common
-)

diff  --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
deleted file mode 100644
index 6ab95403ca3890..00000000000000
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ /dev/null
@@ -1,183 +0,0 @@
-//===-------------- AMDGPU implementation of GPU utils ----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_AMDGPU_IO_H
-#define LLVM_LIBC_SRC___SUPPORT_GPU_AMDGPU_IO_H
-
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-
-#include <stdint.h>
-
-namespace LIBC_NAMESPACE_DECL {
-namespace gpu {
-
-/// Type aliases to the address spaces used by the AMDGPU backend.
-template <typename T> using Private = [[clang::opencl_private]] T;
-template <typename T> using Constant = [[clang::opencl_constant]] T;
-template <typename T> using Local = [[clang::opencl_local]] T;
-template <typename T> using Global = [[clang::opencl_global]] T;
-
-/// Returns the number of workgroups in the 'x' dimension of the grid.
-LIBC_INLINE uint32_t get_num_blocks_x() {
-  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
-}
-
-/// Returns the number of workgroups in the 'y' dimension of the grid.
-LIBC_INLINE uint32_t get_num_blocks_y() {
-  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
-}
-
-/// Returns the number of workgroups in the 'z' dimension of the grid.
-LIBC_INLINE uint32_t get_num_blocks_z() {
-  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
-}
-
-/// Returns the total number of workgruops in the grid.
-LIBC_INLINE uint64_t get_num_blocks() {
-  return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
-}
-
-/// Returns the 'x' dimension of the current AMD workgroup's id.
-LIBC_INLINE uint32_t get_block_id_x() {
-  return __builtin_amdgcn_workgroup_id_x();
-}
-
-/// Returns the 'y' dimension of the current AMD workgroup's id.
-LIBC_INLINE uint32_t get_block_id_y() {
-  return __builtin_amdgcn_workgroup_id_y();
-}
-
-/// Returns the 'z' dimension of the current AMD workgroup's id.
-LIBC_INLINE uint32_t get_block_id_z() {
-  return __builtin_amdgcn_workgroup_id_z();
-}
-
-/// Returns the absolute id of the AMD workgroup.
-LIBC_INLINE uint64_t get_block_id() {
-  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
-         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
-}
-
-/// Returns the number of workitems in the 'x' dimension.
-LIBC_INLINE uint32_t get_num_threads_x() {
-  return __builtin_amdgcn_workgroup_size_x();
-}
-
-/// Returns the number of workitems in the 'y' dimension.
-LIBC_INLINE uint32_t get_num_threads_y() {
-  return __builtin_amdgcn_workgroup_size_y();
-}
-
-/// Returns the number of workitems in the 'z' dimension.
-LIBC_INLINE uint32_t get_num_threads_z() {
-  return __builtin_amdgcn_workgroup_size_z();
-}
-
-/// Returns the total number of workitems in the workgroup.
-LIBC_INLINE uint64_t get_num_threads() {
-  return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
-}
-
-/// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
-LIBC_INLINE uint32_t get_thread_id_x() {
-  return __builtin_amdgcn_workitem_id_x();
-}
-
-/// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
-LIBC_INLINE uint32_t get_thread_id_y() {
-  return __builtin_amdgcn_workitem_id_y();
-}
-
-/// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
-LIBC_INLINE uint32_t get_thread_id_z() {
-  return __builtin_amdgcn_workitem_id_z();
-}
-
-/// Returns the absolute id of the thread in the current AMD workgroup.
-LIBC_INLINE uint64_t get_thread_id() {
-  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
-         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
-}
-
-/// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
-/// and compilation options.
-LIBC_INLINE uint32_t get_lane_size() {
-  return __builtin_amdgcn_wavefrontsize();
-}
-
-/// Returns the id of the thread inside of an AMD wavefront executing together.
-[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
-  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
-}
-
-/// Returns the bit-mask of active threads in the current wavefront.
-[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
-  return __builtin_amdgcn_read_exec();
-}
-
-/// Copies the value from the first active thread in the wavefront to the rest.
-[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t,
-                                                           uint32_t x) {
-  return __builtin_amdgcn_readfirstlane(x);
-}
-
-/// Returns a bitmask of threads in the current lane for which \p x is true.
-[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
-  // the lane_mask & gives the nvptx semantics when lane_mask is a subset of
-  // the active threads
-  return lane_mask & __builtin_amdgcn_ballot_w64(x);
-}
-
-/// Waits for all the threads in the block to converge and issues a fence.
-[[clang::convergent]] LIBC_INLINE void sync_threads() {
-  __builtin_amdgcn_s_barrier();
-  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
-}
-
-/// Waits for all pending memory operations to complete in program order.
-[[clang::convergent]] LIBC_INLINE void memory_fence() {
-  __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "");
-}
-
-/// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
-[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t) {
-  __builtin_amdgcn_wave_barrier();
-}
-
-/// Shuffles the the lanes inside the wavefront according to the given index.
-[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t idx,
-                                                   uint32_t x) {
-  return __builtin_amdgcn_ds_bpermute(idx << 2, x);
-}
-
-/// Returns the current value of the GPU's processor clock.
-/// NOTE: The RDNA3 and RDNA2 architectures use a 20-bit cycle counter.
-LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
-
-/// Returns a fixed-frequency timestamp. The actual frequency is dependent on
-/// the card and can only be queried via the driver.
-LIBC_INLINE uint64_t fixed_frequency_clock() {
-  return __builtin_readsteadycounter();
-}
-
-/// Terminates execution of the associated wavefront.
-[[noreturn]] LIBC_INLINE void end_program() { __builtin_amdgcn_endpgm(); }
-
-/// Returns a unique identifier for the process cluster the current wavefront is
-/// executing on. Here we use the identifier for the compute unit (CU) and
-/// shader engine.
-/// FIXME: Currently unimplemented on AMDGPU until we have a simpler interface
-/// than the one at
-/// https://github.com/ROCm/clr/blob/develop/hipamd/include/hip/amd_detail/amd_device_functions.h#L899
-LIBC_INLINE uint32_t get_cluster_id() { return 0; }
-
-} // namespace gpu
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif

diff  --git a/libc/src/__support/GPU/generic/CMakeLists.txt b/libc/src/__support/GPU/generic/CMakeLists.txt
deleted file mode 100644
index 68ba7d1ec80e95..00000000000000
--- a/libc/src/__support/GPU/generic/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-add_header_library(
-  generic_utils
-  HDRS
-    utils.h
-  DEPENDS
-    libc.src.__support.common
-)

diff  --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h
deleted file mode 100644
index 9461ef0aa245b7..00000000000000
--- a/libc/src/__support/GPU/generic/utils.h
+++ /dev/null
@@ -1,84 +0,0 @@
-//===-------------- Generic implementation of GPU utils ---------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H
-#define LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H
-
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-
-#include <stdint.h>
-
-namespace LIBC_NAMESPACE_DECL {
-namespace gpu {
-
-template <typename T> using Private = T;
-template <typename T> using Constant = T;
-template <typename T> using Shared = T;
-template <typename T> using Global = T;
-
-LIBC_INLINE uint32_t get_num_blocks_x() { return 1; }
-
-LIBC_INLINE uint32_t get_num_blocks_y() { return 1; }
-
-LIBC_INLINE uint32_t get_num_blocks_z() { return 1; }
-
-LIBC_INLINE uint64_t get_num_blocks() { return 1; }
-
-LIBC_INLINE uint32_t get_block_id_x() { return 0; }
-
-LIBC_INLINE uint32_t get_block_id_y() { return 0; }
-
-LIBC_INLINE uint32_t get_block_id_z() { return 0; }
-
-LIBC_INLINE uint64_t get_block_id() { return 0; }
-
-LIBC_INLINE uint32_t get_num_threads_x() { return 1; }
-
-LIBC_INLINE uint32_t get_num_threads_y() { return 1; }
-
-LIBC_INLINE uint32_t get_num_threads_z() { return 1; }
-
-LIBC_INLINE uint64_t get_num_threads() { return 1; }
-
-LIBC_INLINE uint32_t get_thread_id_x() { return 0; }
-
-LIBC_INLINE uint32_t get_thread_id_y() { return 0; }
-
-LIBC_INLINE uint32_t get_thread_id_z() { return 0; }
-
-LIBC_INLINE uint64_t get_thread_id() { return 0; }
-
-LIBC_INLINE uint32_t get_lane_size() { return 1; }
-
-LIBC_INLINE uint32_t get_lane_id() { return 0; }
-
-LIBC_INLINE uint64_t get_lane_mask() { return 1; }
-
-LIBC_INLINE uint32_t broadcast_value(uint64_t, uint32_t x) { return x; }
-
-LIBC_INLINE uint64_t ballot(uint64_t, bool x) { return x; }
-
-LIBC_INLINE void sync_threads() {}
-
-LIBC_INLINE void sync_lane(uint64_t) {}
-
-LIBC_INLINE uint32_t shuffle(uint64_t, uint32_t, uint32_t x) { return x; }
-
-LIBC_INLINE uint64_t processor_clock() { return 0; }
-
-LIBC_INLINE uint64_t fixed_frequency_clock() { return 0; }
-
-[[noreturn]] LIBC_INLINE void end_program() { __builtin_unreachable(); }
-
-LIBC_INLINE uint32_t get_cluster_id() { return 0; }
-
-} // namespace gpu
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif // LLVM_LIBC_SRC___SUPPORT_GPU_GENERIC_UTILS_H

diff  --git a/libc/src/__support/GPU/nvptx/CMakeLists.txt b/libc/src/__support/GPU/nvptx/CMakeLists.txt
deleted file mode 100644
index 0d3f8c7933c86c..00000000000000
--- a/libc/src/__support/GPU/nvptx/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-add_header_library(
-  nvptx_utils
-  HDRS
-    utils.h
-  DEPENDS
-    libc.src.__support.common
-)

diff  --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
deleted file mode 100644
index 1a43a839a9ce46..00000000000000
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ /dev/null
@@ -1,160 +0,0 @@
-//===-------------- NVPTX implementation of GPU utils -----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-id: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H
-#define LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H
-
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-
-#include <stdint.h>
-
-namespace LIBC_NAMESPACE_DECL {
-namespace gpu {
-
-/// Type aliases to the address spaces used by the NVPTX backend.
-template <typename T> using Private = [[clang::opencl_private]] T;
-template <typename T> using Constant = [[clang::opencl_constant]] T;
-template <typename T> using Local = [[clang::opencl_local]] T;
-template <typename T> using Global = [[clang::opencl_global]] T;
-
-/// Returns the number of CUDA blocks in the 'x' dimension.
-LIBC_INLINE uint32_t get_num_blocks_x() {
-  return __nvvm_read_ptx_sreg_nctaid_x();
-}
-
-/// Returns the number of CUDA blocks in the 'y' dimension.
-LIBC_INLINE uint32_t get_num_blocks_y() {
-  return __nvvm_read_ptx_sreg_nctaid_y();
-}
-
-/// Returns the number of CUDA blocks in the 'z' dimension.
-LIBC_INLINE uint32_t get_num_blocks_z() {
-  return __nvvm_read_ptx_sreg_nctaid_z();
-}
-
-/// Returns the total number of CUDA blocks.
-LIBC_INLINE uint64_t get_num_blocks() {
-  return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
-}
-
-/// Returns the 'x' dimension of the current CUDA block's id.
-LIBC_INLINE uint32_t get_block_id_x() { return __nvvm_read_ptx_sreg_ctaid_x(); }
-
-/// Returns the 'y' dimension of the current CUDA block's id.
-LIBC_INLINE uint32_t get_block_id_y() { return __nvvm_read_ptx_sreg_ctaid_y(); }
-
-/// Returns the 'z' dimension of the current CUDA block's id.
-LIBC_INLINE uint32_t get_block_id_z() { return __nvvm_read_ptx_sreg_ctaid_z(); }
-
-/// Returns the absolute id of the CUDA block.
-LIBC_INLINE uint64_t get_block_id() {
-  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
-         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
-}
-
-/// Returns the number of CUDA threads in the 'x' dimension.
-LIBC_INLINE uint32_t get_num_threads_x() {
-  return __nvvm_read_ptx_sreg_ntid_x();
-}
-
-/// Returns the number of CUDA threads in the 'y' dimension.
-LIBC_INLINE uint32_t get_num_threads_y() {
-  return __nvvm_read_ptx_sreg_ntid_y();
-}
-
-/// Returns the number of CUDA threads in the 'z' dimension.
-LIBC_INLINE uint32_t get_num_threads_z() {
-  return __nvvm_read_ptx_sreg_ntid_z();
-}
-
-/// Returns the total number of threads in the block.
-LIBC_INLINE uint64_t get_num_threads() {
-  return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
-}
-
-/// Returns the 'x' dimension id of the thread in the current CUDA block.
-LIBC_INLINE uint32_t get_thread_id_x() { return __nvvm_read_ptx_sreg_tid_x(); }
-
-/// Returns the 'y' dimension id of the thread in the current CUDA block.
-LIBC_INLINE uint32_t get_thread_id_y() { return __nvvm_read_ptx_sreg_tid_y(); }
-
-/// Returns the 'z' dimension id of the thread in the current CUDA block.
-LIBC_INLINE uint32_t get_thread_id_z() { return __nvvm_read_ptx_sreg_tid_z(); }
-
-/// Returns the absolute id of the thread in the current CUDA block.
-LIBC_INLINE uint64_t get_thread_id() {
-  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
-         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
-}
-
-/// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
-LIBC_INLINE uint32_t get_lane_size() { return 32; }
-
-/// Returns the id of the thread inside of a CUDA warp executing together.
-[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
-  return __nvvm_read_ptx_sreg_laneid();
-}
-
-/// Returns the bit-mask of active threads in the current warp.
-[[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
-  return __nvvm_activemask();
-}
-
-/// Copies the value from the first active thread in the warp to the rest.
-[[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask,
-                                                           uint32_t x) {
-  uint32_t mask = static_cast<uint32_t>(lane_mask);
-  uint32_t id = __builtin_ffs(mask) - 1;
-  return __nvvm_shfl_sync_idx_i32(mask, x, id, get_lane_size() - 1);
-}
-
-/// Returns a bitmask of threads in the current lane for which \p x is true.
-[[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
-  uint32_t mask = static_cast<uint32_t>(lane_mask);
-  return __nvvm_vote_ballot_sync(mask, x);
-}
-
-/// Waits for all the threads in the block to converge and issues a fence.
-[[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }
-
-/// Waits for all pending memory operations to complete in program order.
-[[clang::convergent]] LIBC_INLINE void memory_fence() { __nvvm_membar_sys(); }
-
-/// Waits for all threads in the warp to reconverge for independent scheduling.
-[[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {
-  __nvvm_bar_warp_sync(static_cast<uint32_t>(mask));
-}
-
-/// Shuffles the the lanes inside the warp according to the given index.
-[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t lane_mask,
-                                                   uint32_t idx, uint32_t x) {
-  uint32_t mask = static_cast<uint32_t>(lane_mask);
-  uint32_t bitmask = (mask >> idx) & 1;
-  return -bitmask & __nvvm_shfl_sync_idx_i32(mask, x, idx, get_lane_size() - 1);
-}
-
-/// Returns the current value of the GPU's processor clock.
-LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
-
-/// Returns a global fixed-frequency timer at nanosecond frequency.
-LIBC_INLINE uint64_t fixed_frequency_clock() {
-  return __builtin_readsteadycounter();
-}
-
-/// Terminates execution of the calling thread.
-[[noreturn]] LIBC_INLINE void end_program() { __nvvm_exit(); }
-
-/// Returns a unique identifier for the process cluster the current warp is
-/// executing on. Here we use the identifier for the symmetric multiprocessor.
-LIBC_INLINE uint32_t get_cluster_id() { return __nvvm_read_ptx_sreg_smid(); }
-
-} // namespace gpu
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif

diff  --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index ae52e7a088ad51..e138c84c0cb22d 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -9,48 +9,108 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H
 #define LLVM_LIBC_SRC___SUPPORT_GPU_UTILS_H
 
+#include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/architectures.h"
 
-#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
-#include "amdgpu/utils.h"
-#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
-#include "nvptx/utils.h"
-#else
-#include "generic/utils.h"
+#if !__has_include(<gpuintrin.h>)
+#error "Unsupported compiler"
 #endif
 
+#include <gpuintrin.h>
+
 namespace LIBC_NAMESPACE_DECL {
 namespace gpu {
-/// Get the first active thread inside the lane.
-LIBC_INLINE uint64_t get_first_lane_id(uint64_t lane_mask) {
-  return __builtin_ffsll(lane_mask) - 1;
+
+template <typename T> using Private = __gpu_private T;
+template <typename T> using Constant = __gpu_constant T;
+template <typename T> using Local = __gpu_local T;
+template <typename T> using Global = __gpu_local T;
+
+LIBC_INLINE uint32_t get_num_blocks_x() { return __gpu_num_blocks(0); }
+
+LIBC_INLINE uint32_t get_num_blocks_y() { return __gpu_num_blocks(1); }
+
+LIBC_INLINE uint32_t get_num_blocks_z() { return __gpu_num_blocks(2); }
+
+LIBC_INLINE uint64_t get_num_blocks() {
+  return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
+}
+
+LIBC_INLINE uint32_t get_block_id_x() { return __gpu_block_id(0); }
+
+LIBC_INLINE uint32_t get_block_id_y() { return __gpu_block_id(1); }
+
+LIBC_INLINE uint32_t get_block_id_z() { return __gpu_block_id(2); }
+
+LIBC_INLINE uint64_t get_block_id() {
+  return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
+         get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
+}
+
+LIBC_INLINE uint32_t get_num_threads_x() { return __gpu_num_threads(0); }
+
+LIBC_INLINE uint32_t get_num_threads_y() { return __gpu_num_threads(1); }
+
+LIBC_INLINE uint32_t get_num_threads_z() { return __gpu_num_threads(2); }
+
+LIBC_INLINE uint64_t get_num_threads() {
+  return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
+}
+
+LIBC_INLINE uint32_t get_thread_id_x() { return __gpu_thread_id(0); }
+
+LIBC_INLINE uint32_t get_thread_id_y() { return __gpu_thread_id(1); }
+
+LIBC_INLINE uint32_t get_thread_id_z() { return __gpu_thread_id(2); }
+
+LIBC_INLINE uint64_t get_thread_id() {
+  return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
+         get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
+}
+
+LIBC_INLINE uint32_t get_lane_size() { return __gpu_num_lanes(); }
+
+LIBC_INLINE uint32_t get_lane_id() { return __gpu_lane_id(); }
+
+LIBC_INLINE uint64_t get_lane_mask() { return __gpu_lane_mask(); }
+
+LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
+  return __gpu_read_first_lane_u32(lane_mask, x);
+}
+
+LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
+  return __gpu_ballot(lane_mask, x);
+}
+
+LIBC_INLINE void sync_threads() { __gpu_sync_threads(); }
+
+LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); }
+
+LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
+  return __gpu_shuffle_idx_u32(lane_mask, idx, x);
 }
 
-/// Conditional that is only true for a single thread in a lane.
+[[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
+
 LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
-  return gpu::get_lane_id() == get_first_lane_id(lane_mask);
+  return __gpu_is_first_in_lane(lane_mask);
 }
 
-/// Gets the sum of all lanes inside the warp or wavefront.
 LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) {
-  for (uint32_t step = gpu::get_lane_size() / 2; step > 0; step /= 2) {
-    uint32_t index = step + gpu::get_lane_id();
-    x += gpu::shuffle(lane_mask, index, x);
-  }
-  return gpu::broadcast_value(lane_mask, x);
+  return __gpu_lane_sum_u32(lane_mask, x);
 }
 
-/// Gets the accumulator scan of the threads in the warp or wavefront.
 LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) {
-  for (uint32_t step = 1; step < gpu::get_lane_size(); step *= 2) {
-    uint32_t index = gpu::get_lane_id() - step;
-    uint32_t bitmask = gpu::get_lane_id() >= step;
-    x += -bitmask & gpu::shuffle(lane_mask, index, x);
-  }
-  return x;
+  return __gpu_lane_scan_u32(lane_mask, x);
+}
+
+LIBC_INLINE uint64_t fixed_frequency_clock() {
+  return __builtin_readsteadycounter();
 }
 
+LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
+
 } // namespace gpu
 } // namespace LIBC_NAMESPACE_DECL
 

diff  --git a/libc/src/time/gpu/clock.cpp b/libc/src/time/gpu/clock.cpp
index add5b2725ef8f0..8609c5cd6b6b73 100644
--- a/libc/src/time/gpu/clock.cpp
+++ b/libc/src/time/gpu/clock.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/time/clock.h"
+
+#include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/gpu/time_utils.h"
 

diff  --git a/libc/src/time/gpu/nanosleep.cpp b/libc/src/time/gpu/nanosleep.cpp
index a92f660f225cb1..d22d9d6bd8d792 100644
--- a/libc/src/time/gpu/nanosleep.cpp
+++ b/libc/src/time/gpu/nanosleep.cpp
@@ -8,6 +8,7 @@
 
 #include "src/time/nanosleep.h"
 
+#include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/gpu/time_utils.h"