[llvm] [OpenMP] Port the OpenMP device runtime to direct C++ compilation (PR #123673)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 20 13:54:17 PST 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-offload

Author: Joseph Huber (jhuber6)

<details>
<summary>Changes</summary>

Summary:
This removes the use of OpenMP offloading to build the device runtime.
The main benefit here is that we no longer need to rely on offloading
semantics to build a device only runtime. Things like variants are now
no longer needed and can just be simple if-defs. In the future, I will
remove most of the special handling here and fold it into calls to the
`<gpuintrin.h>` functions instead. Additionally I will rework the
compilation to make this a separate runtime.

The current plan is to have this, but make including OpenMP and
offloading either automatically add it, or print a warning if it's
missing. This will allow us to use a normal CMake workflow and delete
all the weird 'lets pull the clang binary out of the build' business.
```
-DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES=offload
-DLLVM_RUNTIME_TARGETS=amdgcn-amd-amdhsa
```

After that, linking the OpenMP device runtime will be `-Xoffload-linker
-lomp`. I.e. no more fat binary business.

Only look at the most recent commit since this includes the two dependencies
(fix to AMDGPUEmitPrintfBinding and the PointerToMember bug).


---

Patch is 34.81 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123673.diff


25 Files Affected:

- (modified) offload/DeviceRTL/CMakeLists.txt (+7-11) 
- (modified) offload/DeviceRTL/include/Allocator.h (-4) 
- (modified) offload/DeviceRTL/include/Debug.h (+1-6) 
- (modified) offload/DeviceRTL/include/DeviceTypes.h (+1-9) 
- (modified) offload/DeviceRTL/include/DeviceUtils.h (-4) 
- (modified) offload/DeviceRTL/include/LibC.h (+4-5) 
- (modified) offload/DeviceRTL/include/Mapping.h (-4) 
- (modified) offload/DeviceRTL/include/State.h (+39-47) 
- (modified) offload/DeviceRTL/include/Synchronization.h (-4) 
- (modified) offload/DeviceRTL/include/Workshare.h (-4) 
- (modified) offload/DeviceRTL/src/Allocator.cpp (-4) 
- (modified) offload/DeviceRTL/src/Configuration.cpp (-4) 
- (modified) offload/DeviceRTL/src/Debug.cpp (+2-6) 
- (modified) offload/DeviceRTL/src/DeviceUtils.cpp (+4-12) 
- (modified) offload/DeviceRTL/src/Kernel.cpp (-4) 
- (modified) offload/DeviceRTL/src/LibC.cpp (+18-29) 
- (modified) offload/DeviceRTL/src/Mapping.cpp (+5-25) 
- (modified) offload/DeviceRTL/src/Misc.cpp (+4-13) 
- (modified) offload/DeviceRTL/src/Parallelism.cpp (+2-5) 
- (modified) offload/DeviceRTL/src/Profiling.cpp (-4) 
- (modified) offload/DeviceRTL/src/Reduction.cpp (-4) 
- (modified) offload/DeviceRTL/src/State.cpp (+10-17) 
- (modified) offload/DeviceRTL/src/Synchronization.cpp (+4-32) 
- (modified) offload/DeviceRTL/src/Tasking.cpp (+1-5) 
- (modified) offload/DeviceRTL/src/Workshare.cpp (-4) 


``````````diff
diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
index 099634e211e7a7..8f2a1fd01fabcc 100644
--- a/offload/DeviceRTL/CMakeLists.txt
+++ b/offload/DeviceRTL/CMakeLists.txt
@@ -95,11 +95,10 @@ set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
 list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL PREPEND "-I")
 
 # Set flags for LLVM Bitcode compilation.
-set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
-              ${clang_opt_flags} --offload-device-only
-             -nocudalib -nogpulib -nogpuinc -nostdlibinc
-             -fopenmp -fopenmp-cuda-mode
-             -Wno-unknown-cuda-version -Wno-openmp-target
+set(bc_flags -c -flto -std=c++17 -fvisibility=hidden
+             ${clang_opt_flags} -nogpulib -nostdlibinc
+             -fno-rtti -fno-exceptions -fconvergent-functions
+             -Wno-unknown-cuda-version
              -DOMPTARGET_DEVICE_RUNTIME
              -I${include_directory}
              -I${devicertl_base_directory}/../include
@@ -123,8 +122,7 @@ function(compileDeviceRTLLibrary target_name target_triple)
     add_custom_command(OUTPUT ${outfile}
       COMMAND ${CLANG_TOOL}
       ${bc_flags}
-      -fopenmp-targets=${target_triple}
-      -Xopenmp-target=${target_triple} -march=
+      --target=${target_triple}
       ${target_bc_flags}
       -MD -MF ${depfile}
       ${infile} -o ${outfile}
@@ -242,10 +240,8 @@ function(compileDeviceRTLLibrary target_name target_triple)
     set(ide_target_name omptarget-ide-${target_name})
     add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files})
     target_compile_options(${ide_target_name} PRIVATE
-      -fopenmp-targets=${target_triple} -Xopenmp-target=${target_triple} -march=
-      -fopenmp -fopenmp-cuda-mode -mllvm -openmp-opt-disable
-      -foffload-lto -fvisibility=hidden --offload-device-only
-      -nocudalib -nogpulib -nogpuinc -nostdlibinc -Wno-unknown-cuda-version
+      -fvisibility=hidden --target=${target_triple}
+      -nogpulib -nostdlibinc -Wno-unknown-cuda-version
     )
     target_compile_definitions(${ide_target_name} PRIVATE SHARED_SCRATCHPAD_SIZE=512)
     target_include_directories(${ide_target_name} PRIVATE
diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h
index 475f6a21bb47eb..79c69a2a96b4e9 100644
--- a/offload/DeviceRTL/include/Allocator.h
+++ b/offload/DeviceRTL/include/Allocator.h
@@ -17,8 +17,6 @@
 // Forward declaration.
 struct KernelEnvironmentTy;
 
-#pragma omp begin declare target device_type(nohost)
-
 namespace ompx {
 
 namespace allocator {
@@ -44,6 +42,4 @@ extern "C" {
 [[gnu::weak]] void free(void *Ptr);
 }
 
-#pragma omp end declare target
-
 #endif
diff --git a/offload/DeviceRTL/include/Debug.h b/offload/DeviceRTL/include/Debug.h
index 22998f44a5bea5..98d0fa498d952b 100644
--- a/offload/DeviceRTL/include/Debug.h
+++ b/offload/DeviceRTL/include/Debug.h
@@ -35,15 +35,10 @@ void __assert_fail_internal(const char *expr, const char *msg, const char *file,
       __assert_assume(expr);                                                   \
   }
 #define UNREACHABLE(msg)                                                       \
-  PRINT(msg);                                                                  \
+  printf(msg);                                                                 \
   __builtin_trap();                                                            \
   __builtin_unreachable();
 
 ///}
 
-#define PRINTF(fmt, ...) (void)printf(fmt, ##__VA_ARGS__);
-#define PRINT(str) PRINTF("%s", str)
-
-///}
-
 #endif
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
index 1cd044f432e569..308109b0749f05 100644
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -99,14 +99,7 @@ struct TaskDescriptorTy {
   TaskFnTy TaskFn;
 };
 
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
 using LaneMaskTy = uint64_t;
-#pragma omp end declare variant
-
-#pragma omp begin declare variant match(                                       \
-        device = {arch(amdgcn)}, implementation = {extension(match_none)})
-using LaneMaskTy = uint64_t;
-#pragma omp end declare variant
 
 namespace lanes {
 enum : LaneMaskTy { All = ~(LaneMaskTy)0 };
@@ -163,8 +156,7 @@ typedef enum omp_allocator_handle_t {
 #define OMP_PRAGMA(STR) __PRAGMA(omp STR)
 
 #define SHARED(NAME)                                                           \
-  NAME [[clang::loader_uninitialized]];                                        \
-  OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
+  [[clang::address_space(3)]] NAME [[clang::loader_uninitialized]];
 
 // TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
 //       now that's not the case.
diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h
index 2243673aef61c7..b92514ee9838a1 100644
--- a/offload/DeviceRTL/include/DeviceUtils.h
+++ b/offload/DeviceRTL/include/DeviceUtils.h
@@ -15,8 +15,6 @@
 #include "DeviceTypes.h"
 #include "Shared/Utils.h"
 
-#pragma omp begin declare target device_type(nohost)
-
 namespace utils {
 
 template <typename T> struct type_identity {
@@ -95,6 +93,4 @@ bool isThreadLocalMemPtr(void *Ptr);
 
 } // namespace utils
 
-#pragma omp end declare target
-
 #endif
diff --git a/offload/DeviceRTL/include/LibC.h b/offload/DeviceRTL/include/LibC.h
index 03febdb5083423..94b5e651960674 100644
--- a/offload/DeviceRTL/include/LibC.h
+++ b/offload/DeviceRTL/include/LibC.h
@@ -14,11 +14,10 @@
 
 #include "DeviceTypes.h"
 
-extern "C" {
+namespace ompx {
 
-int memcmp(const void *lhs, const void *rhs, size_t count);
-void memset(void *dst, int C, size_t count);
-int printf(const char *format, ...);
-}
+int printf(const char *Format, ...);
+
+} // namespace ompx
 
 #endif
diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h
index 2fb87abe5418c0..c53ddbfd0d1004 100644
--- a/offload/DeviceRTL/include/Mapping.h
+++ b/offload/DeviceRTL/include/Mapping.h
@@ -24,12 +24,8 @@ enum {
   DIM_Z = 2,
 };
 
-#pragma omp begin declare target device_type(nohost)
-
 inline constexpr uint32_t MaxThreadsPerTeam = 1024;
 
-#pragma omp end declare target
-
 /// Initialize the mapping machinery.
 void init(bool IsSPMD);
 
diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h
index 565235cd48a913..fbf3b751d2f63d 100644
--- a/offload/DeviceRTL/include/State.h
+++ b/offload/DeviceRTL/include/State.h
@@ -22,8 +22,6 @@
 // Forward declaration.
 struct KernelEnvironmentTy;
 
-#pragma omp begin declare target device_type(nohost)
-
 namespace ompx {
 
 namespace memory {
@@ -88,8 +86,7 @@ struct TeamStateTy {
   ParallelRegionFnTy ParallelRegionFnVar;
 };
 
-extern TeamStateTy TeamState;
-#pragma omp allocate(TeamState) allocator(omp_pteam_mem_alloc)
+extern TeamStateTy [[clang::address_space(3)]] TeamState;
 
 struct ThreadStateTy {
 
@@ -115,8 +112,7 @@ struct ThreadStateTy {
   }
 };
 
-extern ThreadStateTy **ThreadStates;
-#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
+extern ThreadStateTy **[[clang::address_space(3)]] ThreadStates;
 
 /// Initialize the state machinery. Must be called by all threads.
 void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
@@ -158,63 +154,61 @@ struct DateEnvironmentRAII {
 /// TODO
 void resetStateForThread(uint32_t TId);
 
-inline uint32_t &lookupForModify32Impl(uint32_t state::ICVStateTy::*Var,
-                                       IdentTy *Ident, bool ForceTeamState) {
-  if (OMP_LIKELY(ForceTeamState || !config::mayUseThreadStates() ||
-                 !TeamState.HasThreadState))
-    return TeamState.ICVState.*Var;
-  uint32_t TId = mapping::getThreadIdInBlock();
-  if (OMP_UNLIKELY(!ThreadStates[TId])) {
-    ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
-        sizeof(ThreadStateTy), "ICV modification outside data environment"));
-    ASSERT(ThreadStates[TId] != nullptr, "Nullptr returned by malloc!");
-    TeamState.HasThreadState = true;
-    ThreadStates[TId]->init();
+// FIXME: https://github.com/llvm/llvm-project/issues/123241.
+#define lookupForModify32Impl(Member, Ident, ForceTeamState)                   \
+  {                                                                            \
+    if (OMP_LIKELY(ForceTeamState || !config::mayUseThreadStates() ||          \
+                   !TeamState.HasThreadState))                                 \
+      return TeamState.ICVState.Member;                                        \
+    uint32_t TId = mapping::getThreadIdInBlock();                              \
+    if (OMP_UNLIKELY(!ThreadStates[TId])) {                                    \
+      ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(                   \
+          memory::allocGlobal(sizeof(ThreadStateTy),                           \
+                              "ICV modification outside data environment"));   \
+      ASSERT(ThreadStates[TId] != nullptr, "Nullptr returned by malloc!");     \
+      TeamState.HasThreadState = true;                                         \
+      ThreadStates[TId]->init();                                               \
+    }                                                                          \
+    return ThreadStates[TId]->ICVState.Member;                                 \
   }
-  return ThreadStates[TId]->ICVState.*Var;
-}
 
-inline uint32_t &lookupImpl(uint32_t state::ICVStateTy::*Var,
-                            bool ForceTeamState) {
-  auto TId = mapping::getThreadIdInBlock();
-  if (OMP_UNLIKELY(!ForceTeamState && config::mayUseThreadStates() &&
-                   TeamState.HasThreadState && ThreadStates[TId]))
-    return ThreadStates[TId]->ICVState.*Var;
-  return TeamState.ICVState.*Var;
-}
+// FIXME: https://github.com/llvm/llvm-project/issues/123241.
+#define lookupImpl(Member, ForceTeamState)                                     \
+  {                                                                            \
+    auto TId = mapping::getThreadIdInBlock();                                  \
+    if (OMP_UNLIKELY(!ForceTeamState && config::mayUseThreadStates() &&        \
+                     TeamState.HasThreadState && ThreadStates[TId]))           \
+      return ThreadStates[TId]->ICVState.Member;                               \
+    return TeamState.ICVState.Member;                                          \
+  }
 
 [[gnu::always_inline, gnu::flatten]] inline uint32_t &
 lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
   switch (Kind) {
   case state::VK_NThreads:
     if (IsReadonly)
-      return lookupImpl(&ICVStateTy::NThreadsVar, ForceTeamState);
-    return lookupForModify32Impl(&ICVStateTy::NThreadsVar, Ident,
-                                 ForceTeamState);
+      lookupImpl(NThreadsVar, ForceTeamState);
+    lookupForModify32Impl(NThreadsVar, Ident, ForceTeamState);
   case state::VK_Level:
     if (IsReadonly)
-      return lookupImpl(&ICVStateTy::LevelVar, ForceTeamState);
-    return lookupForModify32Impl(&ICVStateTy::LevelVar, Ident, ForceTeamState);
+      lookupImpl(LevelVar, ForceTeamState);
+    lookupForModify32Impl(LevelVar, Ident, ForceTeamState);
   case state::VK_ActiveLevel:
     if (IsReadonly)
-      return lookupImpl(&ICVStateTy::ActiveLevelVar, ForceTeamState);
-    return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar, Ident,
-                                 ForceTeamState);
+      lookupImpl(ActiveLevelVar, ForceTeamState);
+    lookupForModify32Impl(ActiveLevelVar, Ident, ForceTeamState);
   case state::VK_MaxActiveLevels:
     if (IsReadonly)
-      return lookupImpl(&ICVStateTy::MaxActiveLevelsVar, ForceTeamState);
-    return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar, Ident,
-                                 ForceTeamState);
+      lookupImpl(MaxActiveLevelsVar, ForceTeamState);
+    lookupForModify32Impl(MaxActiveLevelsVar, Ident, ForceTeamState);
   case state::VK_RunSched:
     if (IsReadonly)
-      return lookupImpl(&ICVStateTy::RunSchedVar, ForceTeamState);
-    return lookupForModify32Impl(&ICVStateTy::RunSchedVar, Ident,
-                                 ForceTeamState);
+      lookupImpl(RunSchedVar, ForceTeamState);
+    lookupForModify32Impl(RunSchedVar, Ident, ForceTeamState);
   case state::VK_RunSchedChunk:
     if (IsReadonly)
-      return lookupImpl(&ICVStateTy::RunSchedChunkVar, ForceTeamState);
-    return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar, Ident,
-                                 ForceTeamState);
+      lookupImpl(RunSchedChunkVar, ForceTeamState);
+    lookupForModify32Impl(RunSchedChunkVar, Ident, ForceTeamState);
   case state::VK_ParallelTeamSize:
     return TeamState.ParallelTeamSize;
   case state::VK_HasThreadState:
@@ -380,6 +374,4 @@ inline state::Value<uint32_t, state::VK_RunSched> RunSched;
 
 } // namespace ompx
 
-#pragma omp end declare target
-
 #endif
diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h
index 3205582e87b998..30b5da680f87f4 100644
--- a/offload/DeviceRTL/include/Synchronization.h
+++ b/offload/DeviceRTL/include/Synchronization.h
@@ -15,8 +15,6 @@
 #include "DeviceTypes.h"
 #include "DeviceUtils.h"
 
-#pragma omp begin declare target device_type(nohost)
-
 namespace ompx {
 namespace atomic {
 
@@ -219,6 +217,4 @@ void system(atomic::OrderingTy Ordering);
 
 } // namespace ompx
 
-#pragma omp end declare target
-
 #endif
diff --git a/offload/DeviceRTL/include/Workshare.h b/offload/DeviceRTL/include/Workshare.h
index fa9b3b2430b8c4..554c3271c334c0 100644
--- a/offload/DeviceRTL/include/Workshare.h
+++ b/offload/DeviceRTL/include/Workshare.h
@@ -12,8 +12,6 @@
 #ifndef OMPTARGET_WORKSHARE_H
 #define OMPTARGET_WORKSHARE_H
 
-#pragma omp begin declare target device_type(nohost)
-
 namespace ompx {
 
 namespace workshare {
@@ -25,6 +23,4 @@ void init(bool IsSPMD);
 
 } // namespace ompx
 
-#pragma omp end declare target
-
 #endif
diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp
index ac662c48d4f5fb..aac2a6005158ef 100644
--- a/offload/DeviceRTL/src/Allocator.cpp
+++ b/offload/DeviceRTL/src/Allocator.cpp
@@ -19,8 +19,6 @@
 
 using namespace ompx;
 
-#pragma omp begin declare target device_type(nohost)
-
 [[gnu::used, gnu::retain, gnu::weak,
   gnu::visibility(
       "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
@@ -77,5 +75,3 @@ void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
 void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
 
 ///}
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp
index 9e14c203d4a04e..0ffab8bd186116 100644
--- a/offload/DeviceRTL/src/Configuration.cpp
+++ b/offload/DeviceRTL/src/Configuration.cpp
@@ -17,8 +17,6 @@
 
 using namespace ompx;
 
-#pragma omp begin declare target device_type(nohost)
-
 // Weak definitions will be overridden by CGOpenmpRuntimeGPU if enabled.
 [[gnu::weak]] extern const uint32_t __omp_rtl_debug_kind = 0;
 [[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_thread_state = 0;
@@ -85,5 +83,3 @@ bool config::mayUseNestedParallelism() {
     return false;
   return state::getKernelEnvironment().Configuration.MayUseNestedParallelism;
 }
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp
index b451f17c6bbd89..5b5482d766b1d0 100644
--- a/offload/DeviceRTL/src/Debug.cpp
+++ b/offload/DeviceRTL/src/Debug.cpp
@@ -21,8 +21,6 @@
 
 using namespace ompx;
 
-#pragma omp begin declare target device_type(nohost)
-
 extern "C" {
 void __assert_assume(bool condition) { __builtin_assume(condition); }
 
@@ -36,13 +34,11 @@ void __assert_assume(bool condition) { __builtin_assume(condition); }
 void __assert_fail_internal(const char *expr, const char *msg, const char *file,
                             unsigned line, const char *function) {
   if (msg) {
-    PRINTF("%s:%u: %s: Assertion %s (`%s`) failed.\n", file, line, function,
+    printf("%s:%u: %s: Assertion %s (`%s`) failed.\n", file, line, function,
            msg, expr);
   } else {
-    PRINTF("%s:%u: %s: Assertion `%s` failed.\n", file, line, function, expr);
+    printf("%s:%u: %s: Assertion `%s` failed.\n", file, line, function, expr);
   }
   __builtin_trap();
 }
 }
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp
index c204a7be73b1fc..d8109537832e96 100644
--- a/offload/DeviceRTL/src/DeviceUtils.cpp
+++ b/offload/DeviceRTL/src/DeviceUtils.cpp
@@ -15,14 +15,10 @@
 #include "Interface.h"
 #include "Mapping.h"
 
-#pragma omp begin declare target device_type(nohost)
-
 using namespace ompx;
 
 namespace impl {
 
-bool isSharedMemPtr(const void *Ptr) { return false; }
-
 void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
   static_assert(sizeof(unsigned long) == 8, "");
   *LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
@@ -42,7 +38,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred);
 /// AMDGCN Implementation
 ///
 ///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
+#ifdef __AMDGPU__
 
 int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
   int Self = mapping::getThreadIdInWarp();
@@ -66,15 +62,13 @@ bool isSharedMemPtr(const void *Ptr) {
   return __builtin_amdgcn_is_shared(
       (const __attribute__((address_space(0))) void *)Ptr);
 }
-#pragma omp end declare variant
+#endif
 ///}
 
 /// NVPTX Implementation
 ///
 ///{
-#pragma omp begin declare variant match(                                       \
-        device = {arch(nvptx, nvptx64)},                                       \
-            implementation = {extension(match_any)})
+#ifdef __NVPTX__
 
 int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
   return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
@@ -91,7 +85,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
 
 bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
 
-#pragma omp end declare variant
+#endif
 ///}
 } // namespace impl
 
@@ -137,5 +131,3 @@ int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
   return utils::shuffleDown(lanes::All, Val, Delta, Width);
 }
 }
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index 8bb275eae776c6..9bb89573dc0cb8 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -25,8 +25,6 @@
 
 using namespace ompx;
 
-#pragma omp begin declare target device_type(nohost)
-
 static void
 inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
                     KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
@@ -155,5 +153,3 @@ void __kmpc_target_deinit() {
 
 int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
 }
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/LibC.cpp b/offload/DeviceRTL/src/LibC.cpp
index 291ceb023a69c5..83f9233d948032 100644
--- a/offload/DeviceRTL/src/LibC.cpp
+++ b/offload/DeviceRTL/src/LibC.cpp
@@ -8,34 +8,11 @@
 
 #include "LibC.h"
 
-#pragma omp begin declare target device_type(nohost)
-
-namespace impl {
-int32_t omp_vprintf(const char *Format, __builtin_va_list vlist);
-}
-
-#ifndef OMPTARGET_HAS_LIBC
-namespace impl {
-#pragma omp begin declare variant match(                                       \
-        device = {arch(nvptx, nvptx64)},                                       \
-            implementation = {extension(match_any)})
-extern "C" int vprintf(const char *format, ...);
-int omp_vprintf(const char *Format, __builtin_va_list vlist) {
-  return vprintf(Format, vlist);
-}
-#pragma omp end declare variant
-
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
-int omp_vprintf(const char *Format, __builtin_va_list) { return -1; }
-#pragma omp end declare variant
-} // namespace impl
-
-extern "C" int printf(const char *Format, ...) {
-  __builtin_va_list vlist;
-  __builtin_va_start(vlist, Format);
-  return impl::omp_vprintf(Format, vlist);
-}
-#endif // OMPTARGET_HAS_LIBC
+#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+ext...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/123673


More information about the llvm-commits mailing list