[llvm] [OpenMP] Replace most GPU helpers with ones from <gpuintrin.h> (PR #125771)
Joseph Huber via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 5 13:13:16 PST 2025
https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/125771
From 89047f2766fb59f7ce108032b4d2f5cd95237873 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 4 Feb 2025 16:15:35 -0600
Subject: [PATCH] [OpenMP] Replace most GPU helpers with ones from
<gpuintrin.h>
Summary:
This patch cleans up the runtime by using the definitions from
`<gpuintrin.h>` instead. This reduces complexity and makes it easier to
port. I have left a handful of leftovers — atomicInc, shuffle, and the sleep
calls. These are not easily replaced, but I will work on it.
---
offload/DeviceRTL/include/DeviceTypes.h | 14 +-
offload/DeviceRTL/src/Configuration.cpp | 4 +-
offload/DeviceRTL/src/DeviceUtils.cpp | 100 ++-------
offload/DeviceRTL/src/Mapping.cpp | 255 ++++------------------
offload/DeviceRTL/src/Misc.cpp | 11 +-
offload/DeviceRTL/src/Reduction.cpp | 4 +-
offload/DeviceRTL/src/State.cpp | 15 +-
offload/DeviceRTL/src/Synchronization.cpp | 2 +-
offload/DeviceRTL/src/Workshare.cpp | 5 +-
9 files changed, 78 insertions(+), 332 deletions(-)
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
index 308109b0749f05..395d72eafbf405 100644
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -12,6 +12,7 @@
#ifndef OMPTARGET_TYPES_H
#define OMPTARGET_TYPES_H
+#include <gpuintrin.h>
#include <stddef.h>
#include <stdint.h>
@@ -155,19 +156,6 @@ typedef enum omp_allocator_handle_t {
#define __PRAGMA(STR) _Pragma(#STR)
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
-#define SHARED(NAME) \
- [[clang::address_space(3)]] NAME [[clang::loader_uninitialized]];
-
-// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
-// now that's not the case.
-#define THREAD_LOCAL(NAME) \
- [[clang::address_space(5)]] NAME [[clang::loader_uninitialized]]
-
-// TODO: clang should use address space 4 for omp_const_mem_alloc, maybe it
-// does?
-#define CONSTANT(NAME) \
- [[clang::address_space(4)]] NAME [[clang::loader_uninitialized]]
-
///}
#endif
diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp
index a2dfa4a02a0947..796e9ee254f3ac 100644
--- a/offload/DeviceRTL/src/Configuration.cpp
+++ b/offload/DeviceRTL/src/Configuration.cpp
@@ -28,8 +28,8 @@ using namespace ompx;
// This variable should be visible to the plugin so we override the default
// hidden visibility.
[[gnu::used, gnu::retain, gnu::weak,
- gnu::visibility("protected")]] DeviceEnvironmentTy
- CONSTANT(__omp_rtl_device_environment);
+ gnu::visibility("protected")]] DeviceEnvironmentTy __gpu_constant
+ __omp_rtl_device_environment;
uint32_t config::getAssumeTeamsOversubscription() {
return __omp_rtl_assume_teams_oversubscription;
diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp
index d8109537832e96..c53a6b3b11c5a9 100644
--- a/offload/DeviceRTL/src/DeviceUtils.cpp
+++ b/offload/DeviceRTL/src/DeviceUtils.cpp
@@ -15,116 +15,48 @@
#include "Interface.h"
#include "Mapping.h"
-using namespace ompx;
-
-namespace impl {
-
-void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
- static_assert(sizeof(unsigned long) == 8, "");
- *LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
- *HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
-}
-
-uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
- return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
-}
-
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width);
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
- int32_t Width);
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred);
-
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
- int Self = mapping::getThreadIdInWarp();
- int Index = SrcLane + (Self & ~(Width - 1));
- return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
-}
-
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
- int32_t Width) {
- int Self = mapping::getThreadIdInWarp();
- int Index = Self + LaneDelta;
- Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
- return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
-}
+#include <gpuintrin.h>
-uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
- return Mask & __builtin_amdgcn_ballot_w64(Pred);
-}
-
-bool isSharedMemPtr(const void *Ptr) {
- return __builtin_amdgcn_is_shared(
- (const __attribute__((address_space(0))) void *)Ptr);
-}
-#endif
-///}
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
- return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
-}
-
-int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
- int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f;
- return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
-}
-
-uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
- return __nvvm_vote_ballot_sync(static_cast<uint32_t>(Mask), Pred);
-}
-
-bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
-
-#endif
-///}
-} // namespace impl
+using namespace ompx;
uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
- return impl::Pack(LowBits, HighBits);
+ return (uint64_t(HighBits) << 32) | uint64_t(LowBits);
}
void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
- impl::Unpack(Val, &LowBits, &HighBits);
+ static_assert(sizeof(unsigned long) == 8, "");
+ LowBits = static_cast<uint32_t>(Val & 0x00000000fffffffful);
+ HighBits = static_cast<uint32_t>((Val & 0xffffffff00000000ul) >> 32);
}
int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane,
int32_t Width) {
- return impl::shuffle(Mask, Var, SrcLane, Width);
+ return __gpu_shuffle_idx_u32(Mask, Var, SrcLane, Width);
}
int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
int32_t Width) {
- return impl::shuffleDown(Mask, Var, Delta, Width);
+ int32_t Self = mapping::getThreadIdInWarp();
+ int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
+ return __gpu_shuffle_idx_u32(Mask, Index, Var, Width);
}
int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta,
int32_t Width) {
- uint32_t Lo, Hi;
- utils::unpack(Var, Lo, Hi);
- Hi = impl::shuffleDown(Mask, Hi, Delta, Width);
- Lo = impl::shuffleDown(Mask, Lo, Delta, Width);
- return utils::pack(Lo, Hi);
+ int32_t Self = mapping::getThreadIdInWarp();
+ int32_t Index = (Delta + (Self & (Width - 1))) >= Width ? Self : Self + Delta;
+ return __gpu_shuffle_idx_u64(Mask, Index, Var, Width);
}
uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
- return impl::ballotSync(Mask, Pred);
+ return __gpu_ballot(Mask, Pred);
}
-bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
+bool utils::isSharedMemPtr(void *Ptr) { return __gpu_is_ptr_local(Ptr); }
extern "C" {
int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
- return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
+ return utils::shuffleDown(lanes::All, Val, Delta, SrcLane);
}
int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index a0c0f6721a84cc..8929692114e61e 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -15,195 +15,12 @@
#include "Interface.h"
#include "State.h"
+#include <gpuintrin.h>
+
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
using namespace ompx;
-namespace ompx {
-namespace impl {
-
-/// AMDGCN Implementation
-///
-///{
-#ifdef __AMDGPU__
-
-uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_workgroup_size_x();
- case 1:
- return __builtin_amdgcn_workgroup_size_y();
- case 2:
- return __builtin_amdgcn_workgroup_size_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
-
-LaneMaskTy lanemaskLT() {
- uint32_t Lane = mapping::getThreadIdInWarp();
- int64_t Ballot = mapping::activemask();
- uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
- return Mask & Ballot;
-}
-
-LaneMaskTy lanemaskGT() {
- uint32_t Lane = mapping::getThreadIdInWarp();
- if (Lane == (mapping::getWarpSize() - 1))
- return 0;
- int64_t Ballot = mapping::activemask();
- uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
- return Mask & Ballot;
-}
-
-uint32_t getThreadIdInWarp() {
- return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
-}
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_workitem_id_x();
- case 1:
- return __builtin_amdgcn_workitem_id_y();
- case 2:
- return __builtin_amdgcn_workitem_id_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfThreadsInKernel() {
- return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
- __builtin_amdgcn_grid_size_z();
-}
-
-uint32_t getBlockIdInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_workgroup_id_x();
- case 1:
- return __builtin_amdgcn_workgroup_id_y();
- case 2:
- return __builtin_amdgcn_workgroup_id_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
- case 1:
- return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
- case 2:
- return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getWarpIdInBlock() {
- return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-}
-
-uint32_t getNumberOfWarpsInBlock() {
- return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
-}
-
-#endif
-///}
-
-/// NVPTX Implementation
-///
-///{
-#ifdef __NVPTX__
-
-uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_ntid_x();
- case 1:
- return __nvvm_read_ptx_sreg_ntid_y();
- case 2:
- return __nvvm_read_ptx_sreg_ntid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
-
-LaneMaskTy activemask() { return __nvvm_activemask(); }
-
-LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
-
-LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
-
-uint32_t getThreadIdInBlock(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_tid_x();
- case 1:
- return __nvvm_read_ptx_sreg_tid_y();
- case 2:
- return __nvvm_read_ptx_sreg_tid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); }
-
-uint32_t getBlockIdInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_ctaid_x();
- case 1:
- return __nvvm_read_ptx_sreg_ctaid_y();
- case 2:
- return __nvvm_read_ptx_sreg_ctaid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
- switch (Dim) {
- case 0:
- return __nvvm_read_ptx_sreg_nctaid_x();
- case 1:
- return __nvvm_read_ptx_sreg_nctaid_y();
- case 2:
- return __nvvm_read_ptx_sreg_nctaid_z();
- };
- UNREACHABLE("Dim outside range!");
-}
-
-uint32_t getNumberOfThreadsInKernel() {
- return impl::getNumberOfThreadsInBlock(0) *
- impl::getNumberOfBlocksInKernel(0) *
- impl::getNumberOfThreadsInBlock(1) *
- impl::getNumberOfBlocksInKernel(1) *
- impl::getNumberOfThreadsInBlock(2) *
- impl::getNumberOfBlocksInKernel(2);
-}
-
-uint32_t getWarpIdInBlock() {
- return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
-}
-
-uint32_t getNumberOfWarpsInBlock() {
- return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
- mapping::getWarpSize();
-}
-
-#endif
-///}
-
-} // namespace impl
-} // namespace ompx
-
-/// We have to be deliberate about the distinction of `mapping::` and `impl::`
-/// below to avoid repeating assumptions or including irrelevant ones.
///{
static bool isInLastWarp() {
@@ -231,69 +48,79 @@ bool mapping::isInitialThreadInLevel0(bool IsSPMD) {
}
bool mapping::isLeaderInWarp() {
- __kmpc_impl_lanemask_t Active = mapping::activemask();
- __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
- return utils::popc(Active & LaneMaskLT) == 0;
+ return __gpu_is_first_in_lane(__gpu_lane_mask());
}
-LaneMaskTy mapping::activemask() { return impl::activemask(); }
+LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); }
-LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); }
+LaneMaskTy mapping::lanemaskLT() {
+#ifdef __NVPTX__
+ return __nvvm_read_ptx_sreg_lanemask_lt();
+#else
+ uint32_t Lane = __gpu_lane_id();
+ int64_t Ballot = __gpu_lane_mask();
+ uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
+ return Mask & Ballot;
+#endif
+}
-LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); }
+LaneMaskTy mapping::lanemaskGT() {
+#ifdef __NVPTX__
+ return __nvvm_read_ptx_sreg_lanemask_gt();
+#else
+ uint32_t Lane = __gpu_lane_id();
+ if (Lane == (__gpu_num_lanes() - 1))
+ return 0;
+ int64_t Ballot = __gpu_lane_mask();
+ uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
+ return Mask & Ballot;
+#endif
+}
uint32_t mapping::getThreadIdInWarp() {
- uint32_t ThreadIdInWarp = impl::getThreadIdInWarp();
- ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr);
+ uint32_t ThreadIdInWarp = __gpu_lane_id();
return ThreadIdInWarp;
}
uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
- uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim);
+ uint32_t ThreadIdInBlock = __gpu_thread_id(Dim);
return ThreadIdInBlock;
}
-uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
+uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); }
uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
// If we are in SPMD mode, remove one warp.
- return BlockSize - (!IsSPMD * impl::getWarpSize());
+ return BlockSize - (!IsSPMD * __gpu_num_lanes());
}
uint32_t mapping::getMaxTeamThreads() {
return mapping::getMaxTeamThreads(mapping::isSPMDMode());
}
uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
- return impl::getNumberOfThreadsInBlock(Dim);
+ return __gpu_num_threads(Dim);
}
uint32_t mapping::getNumberOfThreadsInKernel() {
- return impl::getNumberOfThreadsInKernel();
+ return __gpu_num_threads(mapping::DIM_X) * __gpu_num_threads(mapping::DIM_Y) *
+ __gpu_num_threads(mapping::DIM_Z);
}
uint32_t mapping::getWarpIdInBlock() {
- uint32_t WarpID = impl::getWarpIdInBlock();
- ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr);
- return WarpID;
+ return __gpu_thread_id(mapping::DIM_X) / __gpu_num_lanes();
}
uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
- uint32_t BlockId = impl::getBlockIdInKernel(Dim);
- ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr);
- return BlockId;
+ return __gpu_block_id(Dim);
}
uint32_t mapping::getNumberOfWarpsInBlock() {
- uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock();
- ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr);
- return NumberOfWarpsInBlocks;
+ return __gpu_block_id(mapping::DIM_X) / __gpu_num_lanes();
}
uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
- uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim);
- ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr);
- return NumberOfBlocks;
+ return __gpu_num_blocks(Dim);
}
uint32_t mapping::getNumberOfProcessorElements() {
@@ -308,7 +135,7 @@ uint32_t mapping::getNumberOfProcessorElements() {
// TODO: This is a workaround for initialization coming from kernels outside of
// the TU. We will need to solve this more correctly in the future.
-[[gnu::weak]] int SHARED(IsSPMDMode);
+[[clang::loader_uninitialized, gnu::weak]] int __gpu_local IsSPMDMode;
void mapping::init(bool IsSPMD) {
if (mapping::isInitialThreadInLevel0(IsSPMD))
@@ -326,12 +153,10 @@ extern "C" {
}
[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
- return impl::getNumberOfThreadsInBlock(mapping::DIM_X);
+ return mapping::getNumberOfThreadsInBlock(mapping::DIM_X);
}
-[[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
- return impl::getWarpSize();
-}
+[[gnu::noinline]] uint32_t __kmpc_get_warp_size() { return __gpu_num_lanes(); }
}
#define _TGT_KERNEL_LANGUAGE(NAME, MAPPER_NAME) \
diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp
index 734e937f039201..b1f936c4f13e21 100644
--- a/offload/DeviceRTL/src/Misc.cpp
+++ b/offload/DeviceRTL/src/Misc.cpp
@@ -32,10 +32,6 @@ double getWTick() {
return 1.0 / config::getClockFrequency();
}
-double getWTime() {
- return static_cast<double>(__builtin_readsteadycounter()) * getWTick();
-}
-
#endif
/// NVPTX Implementation
@@ -48,13 +44,12 @@ double getWTick() {
return ((double)1E-9);
}
+#endif
+
double getWTime() {
- uint64_t nsecs = __nvvm_read_ptx_sreg_globaltimer();
- return static_cast<double>(nsecs) * getWTick();
+ return static_cast<double>(__builtin_readsteadycounter()) * getWTick();
}
-#endif
-
/// Lookup a device-side function using a host pointer /p HstPtr using the table
/// provided by the device plugin. The table is an ordered pair of host and
/// device pointers sorted on the value of the host pointer.
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
index 25f34005532f7c..f78a940d9cc13a 100644
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ b/offload/DeviceRTL/src/Reduction.cpp
@@ -196,8 +196,8 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
uint32_t NumThreads = omp_get_num_threads();
uint32_t TeamId = omp_get_team_num();
uint32_t NumTeams = omp_get_num_teams();
- static unsigned SHARED(Bound);
- static unsigned SHARED(ChunkTeamCount);
+ static unsigned __gpu_local Bound;
+ static unsigned __gpu_local ChunkTeamCount;
// Block progress for teams greater than the current upper
// limit. We always only allow a number of teams less or equal
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 89edb4802198c9..0981b33dfdd4dc 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -32,11 +32,13 @@ using namespace ompx;
[[clang::address_space(3)]] DynamicSharedBuffer[];
/// The kernel environment passed to the init method by the compiler.
-static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
+[[clang::loader_uninitialized]] static KernelEnvironmentTy *__gpu_local
+ KernelEnvironmentPtr;
/// The kernel launch environment passed as argument to the kernel by the
/// runtime.
-static KernelLaunchEnvironmentTy *SHARED(KernelLaunchEnvironmentPtr);
+[[clang::loader_uninitialized]] static KernelLaunchEnvironmentTy *__gpu_local
+ KernelLaunchEnvironmentPtr;
///}
@@ -108,7 +110,8 @@ static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
"Shared scratchpad of this size not supported yet.");
/// The allocation of a single shared memory scratchpad.
-static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);
+[[clang::loader_uninitialized]] static SharedMemorySmartStackTy __gpu_local
+ SharedMemorySmartStack;
void SharedMemorySmartStackTy::init(bool IsSPMD) {
Usage[mapping::getThreadIdInBlock()] = 0;
@@ -220,8 +223,10 @@ void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
ASSERT(HasThreadState == Other.HasThreadState, nullptr);
}
-state::TeamStateTy SHARED(ompx::state::TeamState);
-state::ThreadStateTy **SHARED(ompx::state::ThreadStates);
+[[clang::loader_uninitialized]] state::TeamStateTy __gpu_local
+ ompx::state::TeamState;
+[[clang::loader_uninitialized]] state::ThreadStateTy **__gpu_local
+ ompx::state::ThreadStates;
namespace {
diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp
index a5090b96560c8b..28d94a484798d3 100644
--- a/offload/DeviceRTL/src/Synchronization.cpp
+++ b/offload/DeviceRTL/src/Synchronization.cpp
@@ -69,7 +69,7 @@ uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
}
}
-uint32_t SHARED(namedBarrierTracker);
+[[clang::loader_uninitialized]] static uint32_t __gpu_local namedBarrierTracker;
void namedBarrierInit() {
// Don't have global ctors, and shared memory is not zero init
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index b1f037a11bddf1..9e34b94fe4b208 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -45,7 +45,7 @@ struct DynamicScheduleTracker {
#define LAST_CHUNK 2
// TODO: This variable is a hack inherited from the old runtime.
-static uint64_t SHARED(Cnt);
+[[clang::loader_uninitialized]] static uint64_t __gpu_local Cnt;
template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
////////////////////////////////////////////////////////////////////////////////
@@ -457,7 +457,8 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
//
// __kmpc_dispatch_deinit
//
-static DynamicScheduleTracker **SHARED(ThreadDST);
+[[clang::loader_uninitialized]] static DynamicScheduleTracker **__gpu_local
+ ThreadDST;
// Create a new DST, link the current one, and define the new as current.
static DynamicScheduleTracker *pushDST() {
More information about the llvm-commits
mailing list