[llvm] [OpenMP] Replace most GPU helpers with ones from <gpuintrin.h> (PR #125771)
Joseph Huber via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 4 14:17:45 PST 2025
https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/125771
Summary:
This patch cleans up the runtime by using the definitions from
`<gpuintrin.h>` instead. This reduces complexity and makes it easier to
port. I have left a handful of leftovers — atomicInc, shuffle, and the sleep
calls. These are not easily replaced, but I will work on them.
>From 6a2bf1dc8c7669161b1c7b089e0c998299ee846f Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 20 Jan 2025 14:41:34 -0600
Subject: [PATCH 1/2] [OpenMP] Port the OpenMP device runtime to direct C++
compilation
Summary:
This removes the use of OpenMP offloading to build the device runtime.
The main benefit here is that we no longer need to rely on offloading
semantics to build a device-only runtime. Things like variants are no
longer needed and can just be simple if-defs. In the future, I will
remove most of the special handling here and fold it into calls to the
`<gpuintrin.h>` functions instead. Additionally I will rework the
compilation to make this a separate runtime.
The current plan is to have this, but make including OpenMP and
offloading either automatically add it, or print a warning if it's
missing. This will allow us to use a normal CMake workflow and delete
all the weird 'let's pull the clang binary out of the build' business.
```
-DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES=offload
-DLLVM_RUNTIME_TARGETS=amdgcn-amd-amdhsa
```
After that, linking the OpenMP device runtime will be `-Xoffload-linker
-lomp`. I.e. no more fat binary business.
---
offload/DeviceRTL/CMakeLists.txt | 18 ++++-------
offload/DeviceRTL/include/Allocator.h | 4 ---
offload/DeviceRTL/include/DeviceTypes.h | 10 +-----
offload/DeviceRTL/include/DeviceUtils.h | 4 ---
offload/DeviceRTL/include/Mapping.h | 4 ---
offload/DeviceRTL/include/State.h | 10 ++----
offload/DeviceRTL/include/Synchronization.h | 4 ---
offload/DeviceRTL/include/Workshare.h | 4 ---
offload/DeviceRTL/src/Allocator.cpp | 4 ---
offload/DeviceRTL/src/Configuration.cpp | 4 ---
offload/DeviceRTL/src/Debug.cpp | 4 ---
offload/DeviceRTL/src/DeviceUtils.cpp | 16 +++------
offload/DeviceRTL/src/Kernel.cpp | 4 ---
offload/DeviceRTL/src/LibC.cpp | 4 ---
offload/DeviceRTL/src/Mapping.cpp | 30 +++--------------
offload/DeviceRTL/src/Misc.cpp | 17 +++-------
offload/DeviceRTL/src/Parallelism.cpp | 4 ---
offload/DeviceRTL/src/Profiling.cpp | 4 ---
offload/DeviceRTL/src/Reduction.cpp | 4 ---
offload/DeviceRTL/src/State.cpp | 19 ++++-------
offload/DeviceRTL/src/Synchronization.cpp | 36 +++------------------
offload/DeviceRTL/src/Tasking.cpp | 6 +---
offload/DeviceRTL/src/Workshare.cpp | 4 ---
23 files changed, 34 insertions(+), 184 deletions(-)
diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
index 099634e211e7a7..8f2a1fd01fabcc 100644
--- a/offload/DeviceRTL/CMakeLists.txt
+++ b/offload/DeviceRTL/CMakeLists.txt
@@ -95,11 +95,10 @@ set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL PREPEND "-I")
# Set flags for LLVM Bitcode compilation.
-set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
- ${clang_opt_flags} --offload-device-only
- -nocudalib -nogpulib -nogpuinc -nostdlibinc
- -fopenmp -fopenmp-cuda-mode
- -Wno-unknown-cuda-version -Wno-openmp-target
+set(bc_flags -c -flto -std=c++17 -fvisibility=hidden
+ ${clang_opt_flags} -nogpulib -nostdlibinc
+ -fno-rtti -fno-exceptions -fconvergent-functions
+ -Wno-unknown-cuda-version
-DOMPTARGET_DEVICE_RUNTIME
-I${include_directory}
-I${devicertl_base_directory}/../include
@@ -123,8 +122,7 @@ function(compileDeviceRTLLibrary target_name target_triple)
add_custom_command(OUTPUT ${outfile}
COMMAND ${CLANG_TOOL}
${bc_flags}
- -fopenmp-targets=${target_triple}
- -Xopenmp-target=${target_triple} -march=
+ --target=${target_triple}
${target_bc_flags}
-MD -MF ${depfile}
${infile} -o ${outfile}
@@ -242,10 +240,8 @@ function(compileDeviceRTLLibrary target_name target_triple)
set(ide_target_name omptarget-ide-${target_name})
add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files})
target_compile_options(${ide_target_name} PRIVATE
- -fopenmp-targets=${target_triple} -Xopenmp-target=${target_triple} -march=
- -fopenmp -fopenmp-cuda-mode -mllvm -openmp-opt-disable
- -foffload-lto -fvisibility=hidden --offload-device-only
- -nocudalib -nogpulib -nogpuinc -nostdlibinc -Wno-unknown-cuda-version
+ -fvisibility=hidden --target=${target_triple}
+ -nogpulib -nostdlibinc -Wno-unknown-cuda-version
)
target_compile_definitions(${ide_target_name} PRIVATE SHARED_SCRATCHPAD_SIZE=512)
target_include_directories(${ide_target_name} PRIVATE
diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h
index 475f6a21bb47eb..79c69a2a96b4e9 100644
--- a/offload/DeviceRTL/include/Allocator.h
+++ b/offload/DeviceRTL/include/Allocator.h
@@ -17,8 +17,6 @@
// Forward declaration.
struct KernelEnvironmentTy;
-#pragma omp begin declare target device_type(nohost)
-
namespace ompx {
namespace allocator {
@@ -44,6 +42,4 @@ extern "C" {
[[gnu::weak]] void free(void *Ptr);
}
-#pragma omp end declare target
-
#endif
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
index 1cd044f432e569..308109b0749f05 100644
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -99,14 +99,7 @@ struct TaskDescriptorTy {
TaskFnTy TaskFn;
};
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
using LaneMaskTy = uint64_t;
-#pragma omp end declare variant
-
-#pragma omp begin declare variant match( \
- device = {arch(amdgcn)}, implementation = {extension(match_none)})
-using LaneMaskTy = uint64_t;
-#pragma omp end declare variant
namespace lanes {
enum : LaneMaskTy { All = ~(LaneMaskTy)0 };
@@ -163,8 +156,7 @@ typedef enum omp_allocator_handle_t {
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
#define SHARED(NAME) \
- NAME [[clang::loader_uninitialized]]; \
- OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
+ [[clang::address_space(3)]] NAME [[clang::loader_uninitialized]];
// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
// now that's not the case.
diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h
index 2243673aef61c7..b92514ee9838a1 100644
--- a/offload/DeviceRTL/include/DeviceUtils.h
+++ b/offload/DeviceRTL/include/DeviceUtils.h
@@ -15,8 +15,6 @@
#include "DeviceTypes.h"
#include "Shared/Utils.h"
-#pragma omp begin declare target device_type(nohost)
-
namespace utils {
template <typename T> struct type_identity {
@@ -95,6 +93,4 @@ bool isThreadLocalMemPtr(void *Ptr);
} // namespace utils
-#pragma omp end declare target
-
#endif
diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h
index 2217eb7616b386..f892a025159d48 100644
--- a/offload/DeviceRTL/include/Mapping.h
+++ b/offload/DeviceRTL/include/Mapping.h
@@ -24,12 +24,8 @@ enum {
DIM_Z = 2,
};
-#pragma omp begin declare target device_type(nohost)
-
inline constexpr uint32_t MaxThreadsPerTeam = 1024;
-#pragma omp end declare target
-
/// Initialize the mapping machinery.
void init(bool IsSPMD);
diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h
index f0500c1083d7f4..58b619ff1072ae 100644
--- a/offload/DeviceRTL/include/State.h
+++ b/offload/DeviceRTL/include/State.h
@@ -22,8 +22,6 @@
// Forward declaration.
struct KernelEnvironmentTy;
-#pragma omp begin declare target device_type(nohost)
-
namespace ompx {
namespace memory {
@@ -88,8 +86,7 @@ struct TeamStateTy {
ParallelRegionFnTy ParallelRegionFnVar;
};
-extern TeamStateTy TeamState;
-#pragma omp allocate(TeamState) allocator(omp_pteam_mem_alloc)
+extern TeamStateTy [[clang::address_space(3)]] TeamState;
struct ThreadStateTy {
@@ -115,8 +112,7 @@ struct ThreadStateTy {
}
};
-extern ThreadStateTy **ThreadStates;
-#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
+extern ThreadStateTy **[[clang::address_space(3)]] ThreadStates;
/// Initialize the state machinery. Must be called by all threads.
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
@@ -378,6 +374,4 @@ inline state::Value<uint32_t, state::VK_RunSched> RunSched;
} // namespace ompx
-#pragma omp end declare target
-
#endif
diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h
index 5045d3c2c99a33..f9eb8d0d231983 100644
--- a/offload/DeviceRTL/include/Synchronization.h
+++ b/offload/DeviceRTL/include/Synchronization.h
@@ -15,8 +15,6 @@
#include "DeviceTypes.h"
#include "DeviceUtils.h"
-#pragma omp begin declare target device_type(nohost)
-
namespace ompx {
namespace atomic {
@@ -220,6 +218,4 @@ void system(atomic::OrderingTy Ordering);
} // namespace ompx
-#pragma omp end declare target
-
#endif
diff --git a/offload/DeviceRTL/include/Workshare.h b/offload/DeviceRTL/include/Workshare.h
index fa9b3b2430b8c4..554c3271c334c0 100644
--- a/offload/DeviceRTL/include/Workshare.h
+++ b/offload/DeviceRTL/include/Workshare.h
@@ -12,8 +12,6 @@
#ifndef OMPTARGET_WORKSHARE_H
#define OMPTARGET_WORKSHARE_H
-#pragma omp begin declare target device_type(nohost)
-
namespace ompx {
namespace workshare {
@@ -25,6 +23,4 @@ void init(bool IsSPMD);
} // namespace ompx
-#pragma omp end declare target
-
#endif
diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp
index ac662c48d4f5fb..aac2a6005158ef 100644
--- a/offload/DeviceRTL/src/Allocator.cpp
+++ b/offload/DeviceRTL/src/Allocator.cpp
@@ -19,8 +19,6 @@
using namespace ompx;
-#pragma omp begin declare target device_type(nohost)
-
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility(
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
@@ -77,5 +75,3 @@ void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
///}
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp
index 0b488b8034178d..a2dfa4a02a0947 100644
--- a/offload/DeviceRTL/src/Configuration.cpp
+++ b/offload/DeviceRTL/src/Configuration.cpp
@@ -17,8 +17,6 @@
using namespace ompx;
-#pragma omp begin declare target device_type(nohost)
-
// Weak definitions will be overridden by CGOpenmpRuntimeGPU if enabled.
[[gnu::weak]] extern const uint32_t __omp_rtl_debug_kind = 0;
[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_thread_state = 0;
@@ -85,5 +83,3 @@ bool config::mayUseNestedParallelism() {
return false;
return state::getKernelEnvironment().Configuration.MayUseNestedParallelism;
}
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp
index 1d9c9628854222..5b5482d766b1d0 100644
--- a/offload/DeviceRTL/src/Debug.cpp
+++ b/offload/DeviceRTL/src/Debug.cpp
@@ -21,8 +21,6 @@
using namespace ompx;
-#pragma omp begin declare target device_type(nohost)
-
extern "C" {
void __assert_assume(bool condition) { __builtin_assume(condition); }
@@ -44,5 +42,3 @@ void __assert_fail_internal(const char *expr, const char *msg, const char *file,
__builtin_trap();
}
}
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp
index c204a7be73b1fc..d8109537832e96 100644
--- a/offload/DeviceRTL/src/DeviceUtils.cpp
+++ b/offload/DeviceRTL/src/DeviceUtils.cpp
@@ -15,14 +15,10 @@
#include "Interface.h"
#include "Mapping.h"
-#pragma omp begin declare target device_type(nohost)
-
using namespace ompx;
namespace impl {
-bool isSharedMemPtr(const void *Ptr) { return false; }
-
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
static_assert(sizeof(unsigned long) == 8, "");
*LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
@@ -42,7 +38,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred);
/// AMDGCN Implementation
///
///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
+#ifdef __AMDGPU__
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
int Self = mapping::getThreadIdInWarp();
@@ -66,15 +62,13 @@ bool isSharedMemPtr(const void *Ptr) {
return __builtin_amdgcn_is_shared(
(const __attribute__((address_space(0))) void *)Ptr);
}
-#pragma omp end declare variant
+#endif
///}
/// NVPTX Implementation
///
///{
-#pragma omp begin declare variant match( \
- device = {arch(nvptx, nvptx64)}, \
- implementation = {extension(match_any)})
+#ifdef __NVPTX__
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
@@ -91,7 +85,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
-#pragma omp end declare variant
+#endif
///}
} // namespace impl
@@ -137,5 +131,3 @@ int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
return utils::shuffleDown(lanes::All, Val, Delta, Width);
}
}
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index 8bb275eae776c6..9bb89573dc0cb8 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -25,8 +25,6 @@
using namespace ompx;
-#pragma omp begin declare target device_type(nohost)
-
static void
inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
@@ -155,5 +153,3 @@ void __kmpc_target_deinit() {
int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
}
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/LibC.cpp b/offload/DeviceRTL/src/LibC.cpp
index e55008f46269fe..83f9233d948032 100644
--- a/offload/DeviceRTL/src/LibC.cpp
+++ b/offload/DeviceRTL/src/LibC.cpp
@@ -8,8 +8,6 @@
#include "LibC.h"
-#pragma omp begin declare target device_type(nohost)
-
#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
extern "C" int vprintf(const char *format, __builtin_va_list) { return -1; }
#else
@@ -48,5 +46,3 @@ namespace ompx {
return ::vprintf(Format, vlist);
}
} // namespace ompx
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 8583a539824c82..a0c0f6721a84cc 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -15,8 +15,6 @@
#include "Interface.h"
#include "State.h"
-#pragma omp begin declare target device_type(nohost)
-
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
using namespace ompx;
@@ -24,24 +22,10 @@ using namespace ompx;
namespace ompx {
namespace impl {
-// Forward declarations defined to be defined for AMDGCN and NVPTX.
-LaneMaskTy activemask();
-LaneMaskTy lanemaskLT();
-LaneMaskTy lanemaskGT();
-uint32_t getThreadIdInWarp();
-uint32_t getThreadIdInBlock(int32_t Dim);
-uint32_t getNumberOfThreadsInBlock(int32_t Dim);
-uint32_t getNumberOfThreadsInKernel();
-uint32_t getBlockIdInKernel(int32_t Dim);
-uint32_t getNumberOfBlocksInKernel(int32_t Dim);
-uint32_t getWarpIdInBlock();
-uint32_t getNumberOfWarpsInBlock();
-uint32_t getWarpSize();
-
/// AMDGCN Implementation
///
///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
+#ifdef __AMDGPU__
uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
@@ -128,15 +112,13 @@ uint32_t getNumberOfWarpsInBlock() {
return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
}
-#pragma omp end declare variant
+#endif
///}
/// NVPTX Implementation
///
///{
-#pragma omp begin declare variant match( \
- device = {arch(nvptx, nvptx64)}, \
- implementation = {extension(match_any)})
+#ifdef __NVPTX__
uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
switch (Dim) {
@@ -214,7 +196,7 @@ uint32_t getNumberOfWarpsInBlock() {
mapping::getWarpSize();
}
-#pragma omp end declare variant
+#endif
///}
} // namespace impl
@@ -376,7 +358,7 @@ float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta,
}
long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) {
- return utils::shuffleDown(mask, var, delta, width);
+ return utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width);
}
double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta,
@@ -385,5 +367,3 @@ double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta,
utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width));
}
}
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp
index 010474b1c4a74d..734e937f039201 100644
--- a/offload/DeviceRTL/src/Misc.cpp
+++ b/offload/DeviceRTL/src/Misc.cpp
@@ -17,19 +17,13 @@
#include "Debug.h"
-#pragma omp begin declare target device_type(nohost)
-
namespace ompx {
namespace impl {
-double getWTick();
-
-double getWTime();
-
/// AMDGCN Implementation
///
///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
+#ifdef __AMDGPU__
double getWTick() {
// The number of ticks per second for the AMDGPU clock varies by card and can
@@ -42,14 +36,12 @@ double getWTime() {
return static_cast<double>(__builtin_readsteadycounter()) * getWTick();
}
-#pragma omp end declare variant
+#endif
/// NVPTX Implementation
///
///{
-#pragma omp begin declare variant match( \
- device = {arch(nvptx, nvptx64)}, \
- implementation = {extension(match_any)})
+#ifdef __NVPTX__
double getWTick() {
// Timer precision is 1ns
@@ -61,7 +53,7 @@ double getWTime() {
return static_cast<double>(nsecs) * getWTick();
}
-#pragma omp end declare variant
+#endif
/// Lookup a device-side function using a host pointer /p HstPtr using the table
/// provided by the device plugin. The table is an ordered pair of host and
@@ -171,4 +163,3 @@ unsigned long long __llvm_omp_host_call(void *fn, void *data, size_t size) {
}
///}
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
index a87e363349b1e5..08ce616aee1c4c 100644
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ b/offload/DeviceRTL/src/Parallelism.cpp
@@ -43,8 +43,6 @@
using namespace ompx;
-#pragma omp begin declare target device_type(nohost)
-
namespace {
uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
@@ -311,5 +309,3 @@ void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
}
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Profiling.cpp b/offload/DeviceRTL/src/Profiling.cpp
index bb3caaadcc03dd..df141af5ebeeaf 100644
--- a/offload/DeviceRTL/src/Profiling.cpp
+++ b/offload/DeviceRTL/src/Profiling.cpp
@@ -8,8 +8,6 @@
#include "Profiling.h"
-#pragma omp begin declare target device_type(nohost)
-
extern "C" {
// Provides empty implementations for certain functions in compiler-rt
@@ -18,5 +16,3 @@ void __llvm_profile_register_function(void *Ptr) {}
void __llvm_profile_register_names_function(void *Ptr, long int I) {}
void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {}
}
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
index 382f6cf392e91a..25f34005532f7c 100644
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ b/offload/DeviceRTL/src/Reduction.cpp
@@ -22,8 +22,6 @@ using namespace ompx;
namespace {
-#pragma omp begin declare target device_type(nohost)
-
void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
shflFct(reduce_data, /*LaneId - not used= */ 0,
@@ -316,5 +314,3 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
void *__kmpc_reduction_get_fixed_buffer() {
return state::getKernelLaunchEnvironment().ReductionBuffer;
}
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 100bc8ab47983c..89edb4802198c9 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -23,16 +23,13 @@
using namespace ompx;
-#pragma omp begin declare target device_type(nohost)
-
/// Memory implementation
///
///{
/// External symbol to access dynamic shared memory.
-[[gnu::aligned(
- allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[];
-#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
+[[gnu::aligned(allocator::ALIGNMENT)]] extern unsigned char
+ [[clang::address_space(3)]] DynamicSharedBuffer[];
/// The kernel environment passed to the init method by the compiler.
static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
@@ -452,13 +449,10 @@ void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;
-[[clang::loader_uninitialized]] static void
- *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
-#pragma omp allocate(SharedMemVariableSharingSpace) \
- allocator(omp_pteam_mem_alloc)
-[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
-#pragma omp allocate(SharedMemVariableSharingSpacePtr) \
- allocator(omp_pteam_mem_alloc)
+[[clang::loader_uninitialized]] static void *[[clang::address_space(
+ 3)]] SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
+[[clang::loader_uninitialized]] static void **[[clang::address_space(
+ 3)]] SharedMemVariableSharingSpacePtr;
void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
@@ -481,4 +475,3 @@ void __kmpc_get_shared_variables(void ***GlobalArgs) {
*GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp
index b09d4801faa012..a5090b96560c8b 100644
--- a/offload/DeviceRTL/src/Synchronization.cpp
+++ b/offload/DeviceRTL/src/Synchronization.cpp
@@ -19,8 +19,6 @@
#include "Mapping.h"
#include "State.h"
-#pragma omp begin declare target device_type(nohost)
-
using namespace ompx;
namespace impl {
@@ -28,34 +26,12 @@ namespace impl {
/// Atomics
///
///{
-/// NOTE: This function needs to be implemented by every target.
-uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
- atomic::MemScopeTy MemScope);
///}
-// Forward declarations defined to be defined for AMDGCN and NVPTX.
-uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
- atomic::MemScopeTy MemScope);
-void namedBarrierInit();
-void namedBarrier();
-void fenceTeam(atomic::OrderingTy Ordering);
-void fenceKernel(atomic::OrderingTy Ordering);
-void fenceSystem(atomic::OrderingTy Ordering);
-void syncWarp(__kmpc_impl_lanemask_t);
-void syncThreads(atomic::OrderingTy Ordering);
-void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
-void unsetLock(omp_lock_t *);
-int testLock(omp_lock_t *);
-void initLock(omp_lock_t *);
-void destroyLock(omp_lock_t *);
-void setLock(omp_lock_t *);
-void unsetCriticalLock(omp_lock_t *);
-void setCriticalLock(omp_lock_t *);
-
/// AMDGCN Implementation
///
///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
+#ifdef __AMDGPU__
uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
atomic::MemScopeTy MemScope) {
@@ -202,15 +178,13 @@ void setCriticalLock(omp_lock_t *Lock) {
}
}
-#pragma omp end declare variant
+#endif
///}
/// NVPTX Implementation
///
///{
-#pragma omp begin declare variant match( \
- device = {arch(nvptx, nvptx64)}, \
- implementation = {extension(match_any)})
+#ifdef __NVPTX__
uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
atomic::MemScopeTy MemScope) {
@@ -283,7 +257,7 @@ void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); }
void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
-#pragma omp end declare variant
+#endif
///}
} // namespace impl
@@ -401,5 +375,3 @@ void ompx_sync_block_divergent(int Ordering) {
impl::syncThreads(atomic::OrderingTy(Ordering));
}
} // extern "C"
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp
index 23a967c1a337e2..d0be0ace50dff4 100644
--- a/offload/DeviceRTL/src/Tasking.cpp
+++ b/offload/DeviceRTL/src/Tasking.cpp
@@ -20,8 +20,6 @@
using namespace ompx;
-#pragma omp begin declare target device_type(nohost)
-
extern "C" {
TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
@@ -29,7 +27,7 @@ TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
size_t SharedValuesSize,
TaskFnTy TaskFn) {
auto TaskSizeInclPrivateValuesPadded =
- utils::roundUp(TaskSizeInclPrivateValues, uint64_t(sizeof(void *)));
+ utils::roundUp(TaskSizeInclPrivateValues, sizeof(void *));
auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize;
TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal(
TaskSizeTotal, "explicit task descriptor");
@@ -103,5 +101,3 @@ int omp_in_final(void) {
int omp_get_max_task_priority(void) { return 0; }
}
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index cb83f1b670c9ee..b1f037a11bddf1 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -44,8 +44,6 @@ struct DynamicScheduleTracker {
#define NOT_FINISHED 1
#define LAST_CHUNK 2
-#pragma omp begin declare target device_type(nohost)
-
// TODO: This variable is a hack inherited from the old runtime.
static uint64_t SHARED(Cnt);
@@ -935,5 +933,3 @@ OMP_LOOP_ENTRY(_4u, uint32_t)
OMP_LOOP_ENTRY(_8, int64_t)
OMP_LOOP_ENTRY(_8u, uint64_t)
}
-
-#pragma omp end declare target
>From 57ce9e651cdc262609c170ba01378d422cf9d722 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 4 Feb 2025 16:15:35 -0600
Subject: [PATCH 2/2] [OpenMP] Replace most GPU helpers with ones from
<gpuintrin.h>
Summary:
This patch cleans up the runtime by using the definitions from
`<gpuintrin.h>` instead. This reduces complexity and makes it easier to
port. I have left a handful of leftovers — atomicInc, shuffle, and the sleep
calls. These are not easily replaced, but I will work on them.
---
offload/DeviceRTL/include/DeviceTypes.h | 14 +-
offload/DeviceRTL/src/Configuration.cpp | 4 +-
offload/DeviceRTL/src/DeviceUtils.cpp | 30 +--
offload/DeviceRTL/src/Mapping.cpp | 255 ++++------------------
offload/DeviceRTL/src/Misc.cpp | 11 +-
offload/DeviceRTL/src/Reduction.cpp | 4 +-
offload/DeviceRTL/src/State.cpp | 15 +-
offload/DeviceRTL/src/Synchronization.cpp | 2 +-
offload/DeviceRTL/src/Workshare.cpp | 5 +-
9 files changed, 68 insertions(+), 272 deletions(-)
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
index 308109b0749f05..395d72eafbf405 100644
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -12,6 +12,7 @@
#ifndef OMPTARGET_TYPES_H
#define OMPTARGET_TYPES_H
+#include <gpuintrin.h>
#include <stddef.h>
#include <stdint.h>
@@ -155,19 +156,6 @@ typedef enum omp_allocator_handle_t {
#define __PRAGMA(STR) _Pragma(#STR)
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
-#define SHARED(NAME) \
- [[clang::address_space(3)]] NAME [[clang::loader_uninitialized]];
-
-// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
-// now that's not the case.
-#define THREAD_LOCAL(NAME) \
- [[clang::address_space(5)]] NAME [[clang::loader_uninitialized]]
-
-// TODO: clang should use address space 4 for omp_const_mem_alloc, maybe it
-// does?
-#define CONSTANT(NAME) \
- [[clang::address_space(4)]] NAME [[clang::loader_uninitialized]]
-
///}
#endif
diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp
index a2dfa4a02a0947..796e9ee254f3ac 100644
--- a/offload/DeviceRTL/src/Configuration.cpp
+++ b/offload/DeviceRTL/src/Configuration.cpp
@@ -28,8 +28,8 @@ using namespace ompx;
// This variable should be visible to the plugin so we override the default
// hidden visibility.
[[gnu::used, gnu::retain, gnu::weak,
- gnu::visibility("protected")]] DeviceEnvironmentTy
- CONSTANT(__omp_rtl_device_environment);
+ gnu::visibility("protected")]] DeviceEnvironmentTy __gpu_constant
+ __omp_rtl_device_environment;
uint32_t config::getAssumeTeamsOversubscription() {
return __omp_rtl_assume_teams_oversubscription;
diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp
index d8109537832e96..50022873a65b14 100644
--- a/offload/DeviceRTL/src/DeviceUtils.cpp
+++ b/offload/DeviceRTL/src/DeviceUtils.cpp
@@ -15,6 +15,8 @@
#include "Interface.h"
#include "Mapping.h"
+#include <gpuintrin.h>
+
using namespace ompx;
namespace impl {
@@ -29,17 +31,12 @@ uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
}
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
- int32_t Width);
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred);
-
/// AMDGCN Implementation
///
///{
#ifdef __AMDGPU__
+// TODO: Move this to <gpuintrin.h>.
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
int Self = mapping::getThreadIdInWarp();
int Index = SrcLane + (Self & ~(Width - 1));
@@ -53,15 +50,6 @@ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
- return Mask & __builtin_amdgcn_ballot_w64(Pred);
-}
-
-bool isSharedMemPtr(const void *Ptr) {
- return __builtin_amdgcn_is_shared(
- (const __attribute__((address_space(0))) void *)Ptr);
-}
#endif
///}
@@ -70,6 +58,7 @@ bool isSharedMemPtr(const void *Ptr) {
///{
#ifdef __NVPTX__
+// TODO: Move this to <gpuintrin.h>.
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
}
@@ -78,13 +67,6 @@ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f;
return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
}
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
- return __nvvm_vote_ballot_sync(static_cast<uint32_t>(Mask), Pred);
-}
-
-bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
-
#endif
///}
} // namespace impl
@@ -117,10 +99,10 @@ int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
}
uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
- return impl::ballotSync(Mask, Pred);
+ return __gpu_ballot(Mask, Pred);
}
-bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
+bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); }
extern "C" {
int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index a0c0f6721a84cc..8929692114e61e 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -15,195 +15,12 @@
#include "Interface.h"
#include "State.h"
+#include <gpuintrin.h>
+
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
using namespace ompx;
-namespace ompx {
-namespace impl {
-
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_workgroup_size_x();
- case 1:
- return __builtin_amdgcn_workgroup_size_y();
- case 2:
- return __builtin_amdgcn_workgroup_size_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
-
-LaneMaskTy lanemaskLT() {
- uint32_t Lane = mapping::getThreadIdInWarp();
- int64_t Ballot = mapping::activemask();
- uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
- return Mask & Ballot;
-}
-
-LaneMaskTy lanemaskGT() {
- uint32_t Lane = mapping::getThreadIdInWarp();
- if (Lane == (mapping::getWarpSize() - 1))
- return 0;
- int64_t Ballot = mapping::activemask();
- uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
- return Mask & Ballot;
-}
-
-uint32_t getThreadIdInWarp() {
- return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
-}
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_workitem_id_x();
- case 1:
- return __builtin_amdgcn_workitem_id_y();
- case 2:
- return __builtin_amdgcn_workitem_id_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfThreadsInKernel() {
- return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
- __builtin_amdgcn_grid_size_z();
-}
-
-uint32_t getBlockIdInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_workgroup_id_x();
- case 1:
- return __builtin_amdgcn_workgroup_id_y();
- case 2:
- return __builtin_amdgcn_workgroup_id_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
- case 1:
- return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
- case 2:
- return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getWarpIdInBlock() {
- return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-}
-
-uint32_t getNumberOfWarpsInBlock() {
- return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
-}
-
-#endif
-///}
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_ntid_x();
- case 1:
- return __nvvm_read_ptx_sreg_ntid_y();
- case 2:
- return __nvvm_read_ptx_sreg_ntid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
-
-LaneMaskTy activemask() { return __nvvm_activemask(); }
-
-LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
-
-LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_tid_x();
- case 1:
- return __nvvm_read_ptx_sreg_tid_y();
- case 2:
- return __nvvm_read_ptx_sreg_tid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); }
-
-uint32_t getBlockIdInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_ctaid_x();
- case 1:
- return __nvvm_read_ptx_sreg_ctaid_y();
- case 2:
- return __nvvm_read_ptx_sreg_ctaid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_nctaid_x();
- case 1:
- return __nvvm_read_ptx_sreg_nctaid_y();
- case 2:
- return __nvvm_read_ptx_sreg_nctaid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfThreadsInKernel() {
- return impl::getNumberOfThreadsInBlock(0) *
- impl::getNumberOfBlocksInKernel(0) *
- impl::getNumberOfThreadsInBlock(1) *
- impl::getNumberOfBlocksInKernel(1) *
- impl::getNumberOfThreadsInBlock(2) *
- impl::getNumberOfBlocksInKernel(2);
-}
-
-uint32_t getWarpIdInBlock() {
- return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-}
-
-uint32_t getNumberOfWarpsInBlock() {
- return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
- mapping::getWarpSize();
-}
-
-#endif
-///}
-
-} // namespace impl
-} // namespace ompx
-
-/// We have to be deliberate about the distinction of `mapping::` and `impl::`
-/// below to avoid repeating assumptions or including irrelevant ones.
///{
static bool isInLastWarp() {
@@ -231,69 +48,79 @@ bool mapping::isInitialThreadInLevel0(bool IsSPMD) {
}
bool mapping::isLeaderInWarp() {
- __kmpc_impl_lanemask_t Active = mapping::activemask();
- __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
- return utils::popc(Active & LaneMaskLT) == 0;
+ return __gpu_is_first_in_lane(__gpu_lane_mask());
}
-LaneMaskTy mapping::activemask() { return impl::activemask(); }
+LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); }
-LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); }
+LaneMaskTy mapping::lanemaskLT() {
+#ifdef __NVPTX__
+ return __nvvm_read_ptx_sreg_lanemask_lt();
+#else
+ uint32_t Lane = __gpu_lane_id();
+ int64_t Ballot = __gpu_lane_mask();
+ uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
+ return Mask & Ballot;
+#endif
+}
-LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); }
+LaneMaskTy mapping::lanemaskGT() {
+#ifdef __NVPTX__
+ return __nvvm_read_ptx_sreg_lanemask_gt();
+#else
+ uint32_t Lane = __gpu_lane_id();
+ if (Lane == (__gpu_num_lanes() - 1))
+ return 0;
+ int64_t Ballot = __gpu_lane_mask();
+ uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
+ return Mask & Ballot;
+#endif
+}
uint32_t mapping::getThreadIdInWarp() {
- uint32_t ThreadIdInWarp = impl::getThreadIdInWarp();
- ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr);
+ uint32_t ThreadIdInWarp = __gpu_lane_id();
return ThreadIdInWarp;
}
uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
- uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim);
+ uint32_t ThreadIdInBlock = __gpu_thread_id(Dim);
return ThreadIdInBlock;
}
-uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
+uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); }
uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
// If we are in SPMD mode, remove one warp.
- return BlockSize - (!IsSPMD * impl::getWarpSize());
+ return BlockSize - (!IsSPMD * __gpu_num_lanes());
}
uint32_t mapping::getMaxTeamThreads() {
return mapping::getMaxTeamThreads(mapping::isSPMDMode());
}
uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
- return impl::getNumberOfThreadsInBlock(Dim);
+ return __gpu_num_threads(Dim);
}
uint32_t mapping::getNumberOfThreadsInKernel() {
- return impl::getNumberOfThreadsInKernel();
+ return __gpu_num_threads(mapping::DIM_X) * __gpu_num_threads(mapping::DIM_Y) *
+ __gpu_num_threads(mapping::DIM_Z);
}
uint32_t mapping::getWarpIdInBlock() {
- uint32_t WarpID = impl::getWarpIdInBlock();
- ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr);
- return WarpID;
+ return __gpu_thread_id(mapping::DIM_X) / __gpu_num_lanes();
}
uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
- uint32_t BlockId = impl::getBlockIdInKernel(Dim);
- ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr);
- return BlockId;
+ return __gpu_block_id(Dim);
}
uint32_t mapping::getNumberOfWarpsInBlock() {
- uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock();
- ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr);
- return NumberOfWarpsInBlocks;
+ return __gpu_block_id(mapping::DIM_X) / __gpu_num_lanes();
}
uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
- uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim);
- ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr);
- return NumberOfBlocks;
+ return __gpu_num_blocks(Dim);
}
uint32_t mapping::getNumberOfProcessorElements() {
@@ -308,7 +135,7 @@ uint32_t mapping::getNumberOfProcessorElements() {
// TODO: This is a workaround for initialization coming from kernels outside of
// the TU. We will need to solve this more correctly in the future.
-[[gnu::weak]] int SHARED(IsSPMDMode);
+[[clang::loader_uninitialized, gnu::weak]] int __gpu_local IsSPMDMode;
void mapping::init(bool IsSPMD) {
if (mapping::isInitialThreadInLevel0(IsSPMD))
@@ -326,12 +153,10 @@ extern "C" {
}
[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
- return impl::getNumberOfThreadsInBlock(mapping::DIM_X);
+ return mapping::getNumberOfThreadsInBlock(mapping::DIM_X);
}
-[[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
- return impl::getWarpSize();
-}
+[[gnu::noinline]] uint32_t __kmpc_get_warp_size() { return __gpu_num_lanes(); }
}
#define _TGT_KERNEL_LANGUAGE(NAME, MAPPER_NAME) \
diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp
index 734e937f039201..b1f936c4f13e21 100644
--- a/offload/DeviceRTL/src/Misc.cpp
+++ b/offload/DeviceRTL/src/Misc.cpp
@@ -32,10 +32,6 @@ double getWTick() {
return 1.0 / config::getClockFrequency();
}
-double getWTime() {
- return static_cast<double>(__builtin_readsteadycounter()) * getWTick();
-}
-
#endif
/// NVPTX Implementation
@@ -48,13 +44,12 @@ double getWTick() {
return ((double)1E-9);
}
+#endif
+
double getWTime() {
- uint64_t nsecs = __nvvm_read_ptx_sreg_globaltimer();
- return static_cast<double>(nsecs) * getWTick();
+ return static_cast<double>(__builtin_readsteadycounter()) * getWTick();
}
-#endif
-
/// Lookup a device-side function using a host pointer /p HstPtr using the table
/// provided by the device plugin. The table is an ordered pair of host and
/// device pointers sorted on the value of the host pointer.
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
index 25f34005532f7c..f78a940d9cc13a 100644
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ b/offload/DeviceRTL/src/Reduction.cpp
@@ -196,8 +196,8 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
uint32_t NumThreads = omp_get_num_threads();
uint32_t TeamId = omp_get_team_num();
uint32_t NumTeams = omp_get_num_teams();
- static unsigned SHARED(Bound);
- static unsigned SHARED(ChunkTeamCount);
+ static unsigned __gpu_local Bound;
+ static unsigned __gpu_local ChunkTeamCount;
// Block progress for teams greater than the current upper
// limit. We always only allow a number of teams less or equal
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 89edb4802198c9..0981b33dfdd4dc 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -32,11 +32,13 @@ using namespace ompx;
[[clang::address_space(3)]] DynamicSharedBuffer[];
/// The kernel environment passed to the init method by the compiler.
-static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
+[[clang::loader_uninitialized]] static KernelEnvironmentTy *__gpu_local
+ KernelEnvironmentPtr;
/// The kernel launch environment passed as argument to the kernel by the
/// runtime.
-static KernelLaunchEnvironmentTy *SHARED(KernelLaunchEnvironmentPtr);
+[[clang::loader_uninitialized]] static KernelLaunchEnvironmentTy *__gpu_local
+ KernelLaunchEnvironmentPtr;
///}
@@ -108,7 +110,8 @@ static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
"Shared scratchpad of this size not supported yet.");
/// The allocation of a single shared memory scratchpad.
-static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);
+[[clang::loader_uninitialized]] static SharedMemorySmartStackTy __gpu_local
+ SharedMemorySmartStack;
void SharedMemorySmartStackTy::init(bool IsSPMD) {
Usage[mapping::getThreadIdInBlock()] = 0;
@@ -220,8 +223,10 @@ void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
ASSERT(HasThreadState == Other.HasThreadState, nullptr);
}
-state::TeamStateTy SHARED(ompx::state::TeamState);
-state::ThreadStateTy **SHARED(ompx::state::ThreadStates);
+[[clang::loader_uninitialized]] state::TeamStateTy __gpu_local
+ ompx::state::TeamState;
+[[clang::loader_uninitialized]] state::ThreadStateTy **__gpu_local
+ ompx::state::ThreadStates;
namespace {
diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp
index a5090b96560c8b..28d94a484798d3 100644
--- a/offload/DeviceRTL/src/Synchronization.cpp
+++ b/offload/DeviceRTL/src/Synchronization.cpp
@@ -69,7 +69,7 @@ uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
}
}
-uint32_t SHARED(namedBarrierTracker);
+[[clang::loader_uninitialized]] static uint32_t __gpu_local namedBarrierTracker;
void namedBarrierInit() {
// Don't have global ctors, and shared memory is not zero init
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index b1f037a11bddf1..9e34b94fe4b208 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -45,7 +45,7 @@ struct DynamicScheduleTracker {
#define LAST_CHUNK 2
// TODO: This variable is a hack inherited from the old runtime.
-static uint64_t SHARED(Cnt);
+[[clang::loader_uninitialized]] static uint64_t __gpu_local Cnt;
template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
////////////////////////////////////////////////////////////////////////////////
@@ -457,7 +457,8 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
//
// __kmpc_dispatch_deinit
//
-static DynamicScheduleTracker **SHARED(ThreadDST);
+[[clang::loader_uninitialized]] static DynamicScheduleTracker **__gpu_local
+ ThreadDST;
// Create a new DST, link the current one, and define the new as current.
static DynamicScheduleTracker *pushDST() {
More information about the llvm-commits
mailing list