[llvm] [OpenMP] Replace use of target address space with <gpuintrin.h> local (PR #126119)

Joseph Huber via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 6 12:08:24 PST 2025


https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/126119

Summary:
This definition is more portable since it defines the correct value for
the target. I got rid of the helper mostly because I think it's easy
enough to use now that it's a type and being explicit about what's
`undef` or `poison` is good.


>From 5c612ef3fe7e76155595bdd5de30a2cf0153de88 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 6 Feb 2025 14:06:07 -0600
Subject: [PATCH] [OpenMP] Replace use of target address space with
 <gpuintrin.h> local

Summary:
This definition is more portable since it defines the correct value for
the target. I got rid of the helper mostly because I think it's easy
enough to use now that it's a type and being explicit about what's
`undef` or `poison` is good.
---
 offload/DeviceRTL/include/DeviceTypes.h   | 14 +------------
 offload/DeviceRTL/src/Configuration.cpp   |  4 ++--
 offload/DeviceRTL/src/Mapping.cpp         |  2 +-
 offload/DeviceRTL/src/Reduction.cpp       | 24 +++++++++++------------
 offload/DeviceRTL/src/State.cpp           | 15 +++++++++-----
 offload/DeviceRTL/src/Synchronization.cpp |  2 +-
 offload/DeviceRTL/src/Workshare.cpp       |  5 +++--
 7 files changed, 30 insertions(+), 36 deletions(-)

diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
index 308109b0749f05b..395d72eafbf4054 100644
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -12,6 +12,7 @@
 #ifndef OMPTARGET_TYPES_H
 #define OMPTARGET_TYPES_H
 
+#include <gpuintrin.h>
 #include <stddef.h>
 #include <stdint.h>
 
@@ -155,19 +156,6 @@ typedef enum omp_allocator_handle_t {
 #define __PRAGMA(STR) _Pragma(#STR)
 #define OMP_PRAGMA(STR) __PRAGMA(omp STR)
 
-#define SHARED(NAME)                                                           \
-  [[clang::address_space(3)]] NAME [[clang::loader_uninitialized]];
-
-// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
-//       now that's not the case.
-#define THREAD_LOCAL(NAME)                                                     \
-  [[clang::address_space(5)]] NAME [[clang::loader_uninitialized]]
-
-// TODO: clang should use address space 4 for omp_const_mem_alloc, maybe it
-//       does?
-#define CONSTANT(NAME)                                                         \
-  [[clang::address_space(4)]] NAME [[clang::loader_uninitialized]]
-
 ///}
 
 #endif
diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp
index a2dfa4a02a09478..796e9ee254f3ac8 100644
--- a/offload/DeviceRTL/src/Configuration.cpp
+++ b/offload/DeviceRTL/src/Configuration.cpp
@@ -28,8 +28,8 @@ using namespace ompx;
 // This variable should be visible to the plugin so we override the default
 // hidden visibility.
 [[gnu::used, gnu::retain, gnu::weak,
-  gnu::visibility("protected")]] DeviceEnvironmentTy
-    CONSTANT(__omp_rtl_device_environment);
+  gnu::visibility("protected")]] DeviceEnvironmentTy __gpu_constant
+    __omp_rtl_device_environment;
 
 uint32_t config::getAssumeTeamsOversubscription() {
   return __omp_rtl_assume_teams_oversubscription;
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index a0c0f6721a84cce..bce62ec9546bc6a 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -308,7 +308,7 @@ uint32_t mapping::getNumberOfProcessorElements() {
 
 // TODO: This is a workaround for initialization coming from kernels outside of
 //       the TU. We will need to solve this more correctly in the future.
-[[gnu::weak]] int SHARED(IsSPMDMode);
+[[clang::loader_uninitialized]] [[gnu::weak]] int __gpu_local IsSPMDMode;
 
 void mapping::init(bool IsSPMD) {
   if (mapping::isInitialThreadInLevel0(IsSPMD))
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
index 25f34005532f7c6..3b5f911451cb7fa 100644
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ b/offload/DeviceRTL/src/Reduction.cpp
@@ -71,16 +71,16 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
   if (NumThreads == 1)
     return 1;
 
-    //
-    // This reduce function handles reduction within a team. It handles
-    // parallel regions in both L1 and L2 parallelism levels. It also
-    // supports Generic, SPMD, and NoOMP modes.
-    //
-    // 1. Reduce within a warp.
-    // 2. Warp master copies value to warp 0 via shared memory.
-    // 3. Warp 0 reduces to a single value.
-    // 4. The reduced value is available in the thread that returns 1.
-    //
+  //
+  // This reduce function handles reduction within a team. It handles
+  // parallel regions in both L1 and L2 parallelism levels. It also
+  // supports Generic, SPMD, and NoOMP modes.
+  //
+  // 1. Reduce within a warp.
+  // 2. Warp master copies value to warp 0 via shared memory.
+  // 3. Warp 0 reduces to a single value.
+  // 4. The reduced value is available in the thread that returns 1.
+  //
 
 #if __has_builtin(__nvvm_reflect)
   if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
@@ -196,8 +196,8 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
   uint32_t NumThreads = omp_get_num_threads();
   uint32_t TeamId = omp_get_team_num();
   uint32_t NumTeams = omp_get_num_teams();
-  static unsigned SHARED(Bound);
-  static unsigned SHARED(ChunkTeamCount);
+  [[clang::loader_uninitialized]] static unsigned __gpu_local Bound;
+  [[clang::loader_uninitialized]] static unsigned __gpu_local ChunkTeamCount;
 
   // Block progress for teams greater than the current upper
   // limit. We always only allow a number of teams less or equal
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 89edb4802198c9c..0981b33dfdd4dce 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -32,11 +32,13 @@ using namespace ompx;
     [[clang::address_space(3)]] DynamicSharedBuffer[];
 
 /// The kernel environment passed to the init method by the compiler.
-static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
+[[clang::loader_uninitialized]] static KernelEnvironmentTy *__gpu_local
+    KernelEnvironmentPtr;
 
 /// The kernel launch environment passed as argument to the kernel by the
 /// runtime.
-static KernelLaunchEnvironmentTy *SHARED(KernelLaunchEnvironmentPtr);
+[[clang::loader_uninitialized]] static KernelLaunchEnvironmentTy *__gpu_local
+    KernelLaunchEnvironmentPtr;
 
 ///}
 
@@ -108,7 +110,8 @@ static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
               "Shared scratchpad of this size not supported yet.");
 
 /// The allocation of a single shared memory scratchpad.
-static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);
+[[clang::loader_uninitialized]] static SharedMemorySmartStackTy __gpu_local
+    SharedMemorySmartStack;
 
 void SharedMemorySmartStackTy::init(bool IsSPMD) {
   Usage[mapping::getThreadIdInBlock()] = 0;
@@ -220,8 +223,10 @@ void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
   ASSERT(HasThreadState == Other.HasThreadState, nullptr);
 }
 
-state::TeamStateTy SHARED(ompx::state::TeamState);
-state::ThreadStateTy **SHARED(ompx::state::ThreadStates);
+[[clang::loader_uninitialized]] state::TeamStateTy __gpu_local
+    ompx::state::TeamState;
+[[clang::loader_uninitialized]] state::ThreadStateTy **__gpu_local
+    ompx::state::ThreadStates;
 
 namespace {
 
diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp
index a5090b96560c8bc..55d6d75dfa7e61c 100644
--- a/offload/DeviceRTL/src/Synchronization.cpp
+++ b/offload/DeviceRTL/src/Synchronization.cpp
@@ -69,7 +69,7 @@ uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
   }
 }
 
-uint32_t SHARED(namedBarrierTracker);
+[[clang::loader_uninitialized]] uint32_t __gpu_local namedBarrierTracker;
 
 void namedBarrierInit() {
   // Don't have global ctors, and shared memory is not zero init
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index b1f037a11bddf1f..9e34b94fe4b208c 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -45,7 +45,7 @@ struct DynamicScheduleTracker {
 #define LAST_CHUNK 2
 
 // TODO: This variable is a hack inherited from the old runtime.
-static uint64_t SHARED(Cnt);
+[[clang::loader_uninitialized]] static uint64_t __gpu_local Cnt;
 
 template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
   ////////////////////////////////////////////////////////////////////////////////
@@ -457,7 +457,8 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
 //
 //  __kmpc_dispatch_deinit
 //
-static DynamicScheduleTracker **SHARED(ThreadDST);
+[[clang::loader_uninitialized]] static DynamicScheduleTracker **__gpu_local
+    ThreadDST;
 
 // Create a new DST, link the current one, and define the new as current.
 static DynamicScheduleTracker *pushDST() {



More information about the llvm-commits mailing list