[llvm] [OpenMP] Port the OpenMP device runtime to direct C++ compilation (PR #123673)

Mon Jan 20 13:53:43 PST 2025

https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/123673

Summary:
This removes the use of OpenMP offloading to build the device runtime.
The main benefit here is that we no longer need to rely on offloading
semantics to build a device only runtime. Things like variants are now
no longer needed and can just be simple if-defs. In the future, I will
remove most of the special handling here and fold it into calls to the
`<gpuintrin.h>` functions instead. Additionally I will rework the
compilation to make this a separate runtime.

The current plan is to have this, but make including OpenMP and
offloading either automatically add it, or print a warning if it's
missing. This will allow us to use a normal CMake workflow and delete
all the weird 'lets pull the clang binary out of the build' business.
```
-DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES=offload
-DLLVM_RUNTIME_TARGETS=amdgcn-amd-amdhsa
```

After that, linking the OpenMP device runtime will be `-Xoffload-linker
-lomp`. I.e. no more fat binary business.

Only look at the most recent commit since this includes the two dependencies
(fix to AMDGPUEmitPrintfBinding and the PointerToMember bug).


>From e663bc98d19b80519460788bd5793316259cfc8c Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 20 Jan 2025 15:20:59 -0600
Subject: [PATCH 1/3] [OpenMP] Adjust 'printf' handling in the OpenMP runtime

Summary:
We used to avoid a lot of this stuff because we didn't properly handle
variadics in device code. That's been solved for now, so we can just
make an internal printf handler that forwards to the external `vprintf`
function. This is either provided by NVIDIA's SDK or by the GPU libc
implementation.

The main reason for doing this is because it prevents the stupid AMDGPU
printf pass from mangling our beautiful printfs!
---
 offload/DeviceRTL/include/Debug.h     |  7 +----
 offload/DeviceRTL/include/LibC.h      |  9 +++---
 offload/DeviceRTL/src/Debug.cpp       |  4 +--
 offload/DeviceRTL/src/LibC.cpp        | 45 +++++++++++----------------
 offload/DeviceRTL/src/Parallelism.cpp |  3 +-
 offload/DeviceRTL/src/State.cpp       |  8 ++---
 6 files changed, 32 insertions(+), 44 deletions(-)

diff --git a/offload/DeviceRTL/include/Debug.h b/offload/DeviceRTL/include/Debug.h
index 22998f44a5bea5..98d0fa498d952b 100644
--- a/offload/DeviceRTL/include/Debug.h
+++ b/offload/DeviceRTL/include/Debug.h
@@ -35,15 +35,10 @@ void __assert_fail_internal(const char *expr, const char *msg, const char *file,
       __assert_assume(expr);                                                   \
   }
 #define UNREACHABLE(msg)                                                       \
-  PRINT(msg);                                                                  \
+  printf(msg);                                                                 \
   __builtin_trap();                                                            \
   __builtin_unreachable();
 
 ///}
 
-#define PRINTF(fmt, ...) (void)printf(fmt, ##__VA_ARGS__);
-#define PRINT(str) PRINTF("%s", str)
-
-///}
-
 #endif
diff --git a/offload/DeviceRTL/include/LibC.h b/offload/DeviceRTL/include/LibC.h
index 03febdb5083423..94b5e651960674 100644
--- a/offload/DeviceRTL/include/LibC.h
+++ b/offload/DeviceRTL/include/LibC.h
@@ -14,11 +14,10 @@
 
 #include "DeviceTypes.h"
 
-extern "C" {
+namespace ompx {
 
-int memcmp(const void *lhs, const void *rhs, size_t count);
-void memset(void *dst, int C, size_t count);
-int printf(const char *format, ...);
-}
+int printf(const char *Format, ...);
+
+} // namespace ompx
 
 #endif
diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp
index b451f17c6bbd89..1d9c9628854222 100644
--- a/offload/DeviceRTL/src/Debug.cpp
+++ b/offload/DeviceRTL/src/Debug.cpp
@@ -36,10 +36,10 @@ void __assert_assume(bool condition) { __builtin_assume(condition); }
 void __assert_fail_internal(const char *expr, const char *msg, const char *file,
                             unsigned line, const char *function) {
   if (msg) {
-    PRINTF("%s:%u: %s: Assertion %s (`%s`) failed.\n", file, line, function,
+    printf("%s:%u: %s: Assertion %s (`%s`) failed.\n", file, line, function,
            msg, expr);
   } else {
-    PRINTF("%s:%u: %s: Assertion `%s` failed.\n", file, line, function, expr);
+    printf("%s:%u: %s: Assertion `%s` failed.\n", file, line, function, expr);
   }
   __builtin_trap();
 }
diff --git a/offload/DeviceRTL/src/LibC.cpp b/offload/DeviceRTL/src/LibC.cpp
index 291ceb023a69c5..e55008f46269fe 100644
--- a/offload/DeviceRTL/src/LibC.cpp
+++ b/offload/DeviceRTL/src/LibC.cpp
@@ -10,32 +10,11 @@
 
 #pragma omp begin declare target device_type(nohost)
 
-namespace impl {
-int32_t omp_vprintf(const char *Format, __builtin_va_list vlist);
-}
-
-#ifndef OMPTARGET_HAS_LIBC
-namespace impl {
-#pragma omp begin declare variant match(                                       \
-        device = {arch(nvptx, nvptx64)},                                       \
-            implementation = {extension(match_any)})
-extern "C" int vprintf(const char *format, ...);
-int omp_vprintf(const char *Format, __builtin_va_list vlist) {
-  return vprintf(Format, vlist);
-}
-#pragma omp end declare variant
-
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
-int omp_vprintf(const char *Format, __builtin_va_list) { return -1; }
-#pragma omp end declare variant
-} // namespace impl
-
-extern "C" int printf(const char *Format, ...) {
-  __builtin_va_list vlist;
-  __builtin_va_start(vlist, Format);
-  return impl::omp_vprintf(Format, vlist);
-}
-#endif // OMPTARGET_HAS_LIBC
+#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+extern "C" int vprintf(const char *format, __builtin_va_list) { return -1; }
+#else
+extern "C" int vprintf(const char *format, __builtin_va_list);
+#endif
 
 extern "C" {
 [[gnu::weak]] int memcmp(const void *lhs, const void *rhs, size_t count) {
@@ -54,6 +33,20 @@ extern "C" {
   for (size_t I = 0; I < count; ++I)
     dstc[I] = C;
 }
+
+[[gnu::weak]] int printf(const char *Format, ...) {
+  __builtin_va_list vlist;
+  __builtin_va_start(vlist, Format);
+  return ::vprintf(Format, vlist);
+}
+}
+
+namespace ompx {
+[[clang::no_builtin("printf")]] int printf(const char *Format, ...) {
+  __builtin_va_list vlist;
+  __builtin_va_start(vlist, Format);
+  return ::vprintf(Format, vlist);
 }
+} // namespace ompx
 
 #pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
index 5286d53b623f0a..a87e363349b1e5 100644
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ b/offload/DeviceRTL/src/Parallelism.cpp
@@ -36,6 +36,7 @@
 #include "DeviceTypes.h"
 #include "DeviceUtils.h"
 #include "Interface.h"
+#include "LibC.h"
 #include "Mapping.h"
 #include "State.h"
 #include "Synchronization.h"
@@ -74,7 +75,7 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
   switch (nargs) {
 #include "generated_microtask_cases.gen"
   default:
-    PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
+    printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
     __builtin_trap();
   }
 }
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 855c74fa58e0a5..100bc8ab47983c 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -138,8 +138,8 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
   }
 
   if (config::isDebugMode(DeviceDebugKind::CommonIssues))
-    PRINT("Shared memory stack full, fallback to dynamic allocation of global "
-          "memory will negatively impact performance.\n");
+    printf("Shared memory stack full, fallback to dynamic allocation of global "
+           "memory will negatively impact performance.\n");
   void *GlobalMemory = memory::allocGlobal(
       AlignedBytes, "Slow path shared memory allocation, insufficient "
                     "shared memory stack memory!");
@@ -173,7 +173,7 @@ void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
 void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
   void *Ptr = malloc(Bytes);
   if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
-    PRINT("nullptr returned by malloc!\n");
+    printf("nullptr returned by malloc!\n");
   return Ptr;
 }
 
@@ -277,7 +277,7 @@ void state::enterDataEnvironment(IdentTy *Ident) {
         sizeof(ThreadStates[0]) * mapping::getNumberOfThreadsInBlock();
     void *ThreadStatesPtr =
         memory::allocGlobal(Bytes, "Thread state array allocation");
-    memset(ThreadStatesPtr, 0, Bytes);
+    __builtin_memset(ThreadStatesPtr, 0, Bytes);
     if (!atomic::cas(ThreadStatesBitsPtr, uintptr_t(0),
                      reinterpret_cast<uintptr_t>(ThreadStatesPtr),
                      atomic::seq_cst, atomic::seq_cst))

>From e29254e8438795feb838d3bd7fa954d157fb1e18 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 20 Jan 2025 11:54:36 -0600
Subject: [PATCH 2/3] [OpenMP] Remove usage of pointer-to-member in lookup

Summary:
This is buggy and is currently being tracked in
https://github.com/llvm/llvm-project/issues/123241. For now, replace it
with a macro so that we can use address spaces directly.
---
 offload/DeviceRTL/include/State.h | 76 +++++++++++++++----------------
 1 file changed, 37 insertions(+), 39 deletions(-)

diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h
index 565235cd48a913..c487ff29680faa 100644
--- a/offload/DeviceRTL/include/State.h
+++ b/offload/DeviceRTL/include/State.h
@@ -158,63 +158,61 @@ struct DateEnvironmentRAII {
 /// TODO
 void resetStateForThread(uint32_t TId);
 
-inline uint32_t &lookupForModify32Impl(uint32_t state::ICVStateTy::*Var,
-                                       IdentTy *Ident, bool ForceTeamState) {
-  if (OMP_LIKELY(ForceTeamState || !config::mayUseThreadStates() ||
-                 !TeamState.HasThreadState))
-    return TeamState.ICVState.*Var;
-  uint32_t TId = mapping::getThreadIdInBlock();
-  if (OMP_UNLIKELY(!ThreadStates[TId])) {
-    ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
-        sizeof(ThreadStateTy), "ICV modification outside data environment"));
-    ASSERT(ThreadStates[TId] != nullptr, "Nullptr returned by malloc!");
-    TeamState.HasThreadState = true;
-    ThreadStates[TId]->init();
+// FIXME: https://github.com/llvm/llvm-project/issues/123241.
+#define lookupForModify32Impl(Member, Ident, ForceTeamState)                   \
+  {                                                                            \
+    if (OMP_LIKELY(ForceTeamState || !config::mayUseThreadStates() ||          \
+                   !TeamState.HasThreadState))                                 \
+      return TeamState.ICVState.Member;                                        \
+    uint32_t TId = mapping::getThreadIdInBlock();                              \
+    if (OMP_UNLIKELY(!ThreadStates[TId])) {                                    \
+      ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(                   \
+          memory::allocGlobal(sizeof(ThreadStateTy),                           \
+                              "ICV modification outside data environment"));   \
+      ASSERT(ThreadStates[TId] != nullptr, "Nullptr returned by malloc!");     \
+      TeamState.HasThreadState = true;                                         \
+      ThreadStates[TId]->init();                                               \
+    }                                                                          \
+    return ThreadStates[TId]->ICVState.Member;                                 \
   }
-  return ThreadStates[TId]->ICVState.*Var;
-}
 
-inline uint32_t &lookupImpl(uint32_t state::ICVStateTy::*Var,
-                            bool ForceTeamState) {
-  auto TId = mapping::getThreadIdInBlock();
-  if (OMP_UNLIKELY(!ForceTeamState && config::mayUseThreadStates() &&
-                   TeamState.HasThreadState && ThreadStates[TId]))
-    return ThreadStates[TId]->ICVState.*Var;
-  return TeamState.ICVState.*Var;
-}
+// FIXME: https://github.com/llvm/llvm-project/issues/123241.
+#define lookupImpl(Member, ForceTeamState)                                     \
+  {                                                                            \
+    auto TId = mapping::getThreadIdInBlock();                                  \
+    if (OMP_UNLIKELY(!ForceTeamState && config::mayUseThreadStates() &&        \
+                     TeamState.HasThreadState && ThreadStates[TId]))           \
+      return ThreadStates[TId]->ICVState.Member;                               \
+    return TeamState.ICVState.Member;                                          \
+  }
 
 [[gnu::always_inline, gnu::flatten]] inline uint32_t &
 lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
   switch (Kind) {
   case state::VK_NThreads:
     if (IsReadonly)
-      return lookupImpl(&ICVStateTy::NThreadsVar, ForceTeamState);
-    return lookupForModify32Impl(&ICVStateTy::NThreadsVar, Ident,
-                                 ForceTeamState);
+      lookupImpl(NThreadsVar, ForceTeamState);
+    lookupForModify32Impl(NThreadsVar, Ident, ForceTeamState);
   case state::VK_Level:
     if (IsReadonly)
-      return lookupImpl(&ICVStateTy::LevelVar, ForceTeamState);
-    return lookupForModify32Impl(&ICVStateTy::LevelVar, Ident, ForceTeamState);
+      lookupImpl(LevelVar, ForceTeamState);
+    lookupForModify32Impl(LevelVar, Ident, ForceTeamState);
   case state::VK_ActiveLevel:
     if (IsReadonly)
-      return lookupImpl(&ICVStateTy::ActiveLevelVar, ForceTeamState);
-    return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar, Ident,
-                                 ForceTeamState);
+      lookupImpl(ActiveLevelVar, ForceTeamState);
+    lookupForModify32Impl(ActiveLevelVar, Ident, ForceTeamState);
   case state::VK_MaxActiveLevels:
     if (IsReadonly)
-      return lookupImpl(&ICVStateTy::MaxActiveLevelsVar, ForceTeamState);
-    return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar, Ident,
-                                 ForceTeamState);
+      lookupImpl(MaxActiveLevelsVar, ForceTeamState);
+    lookupForModify32Impl(MaxActiveLevelsVar, Ident, ForceTeamState);
   case state::VK_RunSched:
     if (IsReadonly)
-      return lookupImpl(&ICVStateTy::RunSchedVar, ForceTeamState);
-    return lookupForModify32Impl(&ICVStateTy::RunSchedVar, Ident,
-                                 ForceTeamState);
+      lookupImpl(RunSchedVar, ForceTeamState);
+    lookupForModify32Impl(RunSchedVar, Ident, ForceTeamState);
   case state::VK_RunSchedChunk:
     if (IsReadonly)
-      return lookupImpl(&ICVStateTy::RunSchedChunkVar, ForceTeamState);
-    return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar, Ident,
-                                 ForceTeamState);
+      lookupImpl(RunSchedChunkVar, ForceTeamState);
+    lookupForModify32Impl(RunSchedChunkVar, Ident, ForceTeamState);
   case state::VK_ParallelTeamSize:
     return TeamState.ParallelTeamSize;
   case state::VK_HasThreadState:

>From f9c871b12f7e22f8f24ad89a26ee383e17dfad2a Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 20 Jan 2025 14:41:34 -0600
Subject: [PATCH 3/3] [OpenMP] Port the OpenMP device runtime to direct C++
 compilation

Summary:
This removes the use of OpenMP offloading to build the device runtime.
The main benefit here is that we no longer need to rely on offloading
semantics to build a device only runtime. Things like variants are now
no longer needed and can just be simple if-defs. In the future, I will
remove most of the special handling here and fold it into calls to the
`<gpuintrin.h>` functions instead. Additionally I will rework the
compilation to make this a separate runtime.

The current plan is to have this, but make including OpenMP and
offloading either automatically add it, or print a warning if it's
missing. This will allow us to use a normal CMake workflow and delete
all the weird 'lets pull the clang binary out of the build' business.
```
-DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES=offload
-DLLVM_RUNTIME_TARGETS=amdgcn-amd-amdhsa
```

After that, linking the OpenMP device runtime will be `-Xoffload-linker
-lomp`. I.e. no more fat binary business.
---
 offload/DeviceRTL/CMakeLists.txt            | 18 ++++-------
 offload/DeviceRTL/include/Allocator.h       |  4 ---
 offload/DeviceRTL/include/DeviceTypes.h     | 10 +-----
 offload/DeviceRTL/include/DeviceUtils.h     |  4 ---
 offload/DeviceRTL/include/Mapping.h         |  4 ---
 offload/DeviceRTL/include/State.h           | 10 ++----
 offload/DeviceRTL/include/Synchronization.h |  4 ---
 offload/DeviceRTL/include/Workshare.h       |  4 ---
 offload/DeviceRTL/src/Allocator.cpp         |  4 ---
 offload/DeviceRTL/src/Configuration.cpp     |  4 ---
 offload/DeviceRTL/src/Debug.cpp             |  4 ---
 offload/DeviceRTL/src/DeviceUtils.cpp       | 16 +++------
 offload/DeviceRTL/src/Kernel.cpp            |  4 ---
 offload/DeviceRTL/src/LibC.cpp              |  4 ---
 offload/DeviceRTL/src/Mapping.cpp           | 30 +++--------------
 offload/DeviceRTL/src/Misc.cpp              | 17 +++-------
 offload/DeviceRTL/src/Parallelism.cpp       |  4 ---
 offload/DeviceRTL/src/Profiling.cpp         |  4 ---
 offload/DeviceRTL/src/Reduction.cpp         |  4 ---
 offload/DeviceRTL/src/State.cpp             | 19 ++++-------
 offload/DeviceRTL/src/Synchronization.cpp   | 36 +++------------------
 offload/DeviceRTL/src/Tasking.cpp           |  6 +---
 offload/DeviceRTL/src/Workshare.cpp         |  4 ---
 23 files changed, 34 insertions(+), 184 deletions(-)

diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
index 099634e211e7a7..8f2a1fd01fabcc 100644
--- a/offload/DeviceRTL/CMakeLists.txt
+++ b/offload/DeviceRTL/CMakeLists.txt
@@ -95,11 +95,10 @@ set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
 list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL PREPEND "-I")
 
 # Set flags for LLVM Bitcode compilation.
-set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
-              ${clang_opt_flags} --offload-device-only
-             -nocudalib -nogpulib -nogpuinc -nostdlibinc
-             -fopenmp -fopenmp-cuda-mode
-             -Wno-unknown-cuda-version -Wno-openmp-target
+set(bc_flags -c -flto -std=c++17 -fvisibility=hidden
+             ${clang_opt_flags} -nogpulib -nostdlibinc
+             -fno-rtti -fno-exceptions -fconvergent-functions
+             -Wno-unknown-cuda-version
              -DOMPTARGET_DEVICE_RUNTIME
              -I${include_directory}
              -I${devicertl_base_directory}/../include
@@ -123,8 +122,7 @@ function(compileDeviceRTLLibrary target_name target_triple)
     add_custom_command(OUTPUT ${outfile}
       COMMAND ${CLANG_TOOL}
       ${bc_flags}
-      -fopenmp-targets=${target_triple}
-      -Xopenmp-target=${target_triple} -march=
+      --target=${target_triple}
       ${target_bc_flags}
       -MD -MF ${depfile}
       ${infile} -o ${outfile}
@@ -242,10 +240,8 @@ function(compileDeviceRTLLibrary target_name target_triple)
     set(ide_target_name omptarget-ide-${target_name})
     add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files})
     target_compile_options(${ide_target_name} PRIVATE
-      -fopenmp-targets=${target_triple} -Xopenmp-target=${target_triple} -march=
-      -fopenmp -fopenmp-cuda-mode -mllvm -openmp-opt-disable
-      -foffload-lto -fvisibility=hidden --offload-device-only
-      -nocudalib -nogpulib -nogpuinc -nostdlibinc -Wno-unknown-cuda-version
+      -fvisibility=hidden --target=${target_triple}
+      -nogpulib -nostdlibinc -Wno-unknown-cuda-version
     )
     target_compile_definitions(${ide_target_name} PRIVATE SHARED_SCRATCHPAD_SIZE=512)
     target_include_directories(${ide_target_name} PRIVATE
diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h
index 475f6a21bb47eb..79c69a2a96b4e9 100644
--- a/offload/DeviceRTL/include/Allocator.h
+++ b/offload/DeviceRTL/include/Allocator.h
@@ -17,8 +17,6 @@
 // Forward declaration.
 struct KernelEnvironmentTy;
 
-#pragma omp begin declare target device_type(nohost)
-
 namespace ompx {
 
 namespace allocator {
@@ -44,6 +42,4 @@ extern "C" {
 [[gnu::weak]] void free(void *Ptr);
 }
 
-#pragma omp end declare target
-
 #endif
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
index 1cd044f432e569..308109b0749f05 100644
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -99,14 +99,7 @@ struct TaskDescriptorTy {
   TaskFnTy TaskFn;
 };
 
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
 using LaneMaskTy = uint64_t;
-#pragma omp end declare variant
-
-#pragma omp begin declare variant match(                                       \
-        device = {arch(amdgcn)}, implementation = {extension(match_none)})
-using LaneMaskTy = uint64_t;
-#pragma omp end declare variant
 
 namespace lanes {
 enum : LaneMaskTy { All = ~(LaneMaskTy)0 };
@@ -163,8 +156,7 @@ typedef enum omp_allocator_handle_t {
 #define OMP_PRAGMA(STR) __PRAGMA(omp STR)
 
 #define SHARED(NAME)                                                           \
-  NAME [[clang::loader_uninitialized]];                                        \
-  OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
+  [[clang::address_space(3)]] NAME [[clang::loader_uninitialized]];
 
 // TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
 //       now that's not the case.
diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h
index 2243673aef61c7..b92514ee9838a1 100644
--- a/offload/DeviceRTL/include/DeviceUtils.h
+++ b/offload/DeviceRTL/include/DeviceUtils.h
@@ -15,8 +15,6 @@
 #include "DeviceTypes.h"
 #include "Shared/Utils.h"
 
-#pragma omp begin declare target device_type(nohost)
-
 namespace utils {
 
 template <typename T> struct type_identity {
@@ -95,6 +93,4 @@ bool isThreadLocalMemPtr(void *Ptr);
 
 } // namespace utils
 
-#pragma omp end declare target
-
 #endif
diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h
index 2fb87abe5418c0..c53ddbfd0d1004 100644
--- a/offload/DeviceRTL/include/Mapping.h
+++ b/offload/DeviceRTL/include/Mapping.h
@@ -24,12 +24,8 @@ enum {
   DIM_Z = 2,
 };
 
-#pragma omp begin declare target device_type(nohost)
-
 inline constexpr uint32_t MaxThreadsPerTeam = 1024;
 
-#pragma omp end declare target
-
 /// Initialize the mapping machinery.
 void init(bool IsSPMD);
 
diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h
index c487ff29680faa..fbf3b751d2f63d 100644
--- a/offload/DeviceRTL/include/State.h
+++ b/offload/DeviceRTL/include/State.h
@@ -22,8 +22,6 @@
 // Forward declaration.
 struct KernelEnvironmentTy;
 
-#pragma omp begin declare target device_type(nohost)
-
 namespace ompx {
 
 namespace memory {
@@ -88,8 +86,7 @@ struct TeamStateTy {
   ParallelRegionFnTy ParallelRegionFnVar;
 };
 
-extern TeamStateTy TeamState;
-#pragma omp allocate(TeamState) allocator(omp_pteam_mem_alloc)
+extern TeamStateTy [[clang::address_space(3)]] TeamState;
 
 struct ThreadStateTy {
 
@@ -115,8 +112,7 @@ struct ThreadStateTy {
   }
 };
 
-extern ThreadStateTy **ThreadStates;
-#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
+extern ThreadStateTy **[[clang::address_space(3)]] ThreadStates;
 
 /// Initialize the state machinery. Must be called by all threads.
 void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
@@ -378,6 +374,4 @@ inline state::Value<uint32_t, state::VK_RunSched> RunSched;
 
 } // namespace ompx
 
-#pragma omp end declare target
-
 #endif
diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h
index 3205582e87b998..30b5da680f87f4 100644
--- a/offload/DeviceRTL/include/Synchronization.h
+++ b/offload/DeviceRTL/include/Synchronization.h
@@ -15,8 +15,6 @@
 #include "DeviceTypes.h"
 #include "DeviceUtils.h"
 
-#pragma omp begin declare target device_type(nohost)
-
 namespace ompx {
 namespace atomic {
 
@@ -219,6 +217,4 @@ void system(atomic::OrderingTy Ordering);
 
 } // namespace ompx
 
-#pragma omp end declare target
-
 #endif
diff --git a/offload/DeviceRTL/include/Workshare.h b/offload/DeviceRTL/include/Workshare.h
index fa9b3b2430b8c4..554c3271c334c0 100644
--- a/offload/DeviceRTL/include/Workshare.h
+++ b/offload/DeviceRTL/include/Workshare.h
@@ -12,8 +12,6 @@
 #ifndef OMPTARGET_WORKSHARE_H
 #define OMPTARGET_WORKSHARE_H
 
-#pragma omp begin declare target device_type(nohost)
-
 namespace ompx {
 
 namespace workshare {
@@ -25,6 +23,4 @@ void init(bool IsSPMD);
 
 } // namespace ompx
 
-#pragma omp end declare target
-
 #endif
diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp
index ac662c48d4f5fb..aac2a6005158ef 100644
--- a/offload/DeviceRTL/src/Allocator.cpp
+++ b/offload/DeviceRTL/src/Allocator.cpp
@@ -19,8 +19,6 @@
 
 using namespace ompx;
 
-#pragma omp begin declare target device_type(nohost)
-
 [[gnu::used, gnu::retain, gnu::weak,
   gnu::visibility(
       "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
@@ -77,5 +75,3 @@ void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
 void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
 
 ///}
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp
index 9e14c203d4a04e..0ffab8bd186116 100644
--- a/offload/DeviceRTL/src/Configuration.cpp
+++ b/offload/DeviceRTL/src/Configuration.cpp
@@ -17,8 +17,6 @@
 
 using namespace ompx;
 
-#pragma omp begin declare target device_type(nohost)
-
 // Weak definitions will be overridden by CGOpenmpRuntimeGPU if enabled.
 [[gnu::weak]] extern const uint32_t __omp_rtl_debug_kind = 0;
 [[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_thread_state = 0;
@@ -85,5 +83,3 @@ bool config::mayUseNestedParallelism() {
     return false;
   return state::getKernelEnvironment().Configuration.MayUseNestedParallelism;
 }
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp
index 1d9c9628854222..5b5482d766b1d0 100644
--- a/offload/DeviceRTL/src/Debug.cpp
+++ b/offload/DeviceRTL/src/Debug.cpp
@@ -21,8 +21,6 @@
 
 using namespace ompx;
 
-#pragma omp begin declare target device_type(nohost)
-
 extern "C" {
 void __assert_assume(bool condition) { __builtin_assume(condition); }
 
@@ -44,5 +42,3 @@ void __assert_fail_internal(const char *expr, const char *msg, const char *file,
   __builtin_trap();
 }
 }
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp
index c204a7be73b1fc..d8109537832e96 100644
--- a/offload/DeviceRTL/src/DeviceUtils.cpp
+++ b/offload/DeviceRTL/src/DeviceUtils.cpp
@@ -15,14 +15,10 @@
 #include "Interface.h"
 #include "Mapping.h"
 
-#pragma omp begin declare target device_type(nohost)
-
 using namespace ompx;
 
 namespace impl {
 
-bool isSharedMemPtr(const void *Ptr) { return false; }
-
 void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
   static_assert(sizeof(unsigned long) == 8, "");
   *LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
@@ -42,7 +38,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred);
 /// AMDGCN Implementation
 ///
 ///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
+#ifdef __AMDGPU__
 
 int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
   int Self = mapping::getThreadIdInWarp();
@@ -66,15 +62,13 @@ bool isSharedMemPtr(const void *Ptr) {
   return __builtin_amdgcn_is_shared(
       (const __attribute__((address_space(0))) void *)Ptr);
 }
-#pragma omp end declare variant
+#endif
 ///}
 
 /// NVPTX Implementation
 ///
 ///{
-#pragma omp begin declare variant match(                                       \
-        device = {arch(nvptx, nvptx64)},                                       \
-            implementation = {extension(match_any)})
+#ifdef __NVPTX__
 
 int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
   return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
@@ -91,7 +85,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
 
 bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
 
-#pragma omp end declare variant
+#endif
 ///}
 } // namespace impl
 
@@ -137,5 +131,3 @@ int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
   return utils::shuffleDown(lanes::All, Val, Delta, Width);
 }
 }
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index 8bb275eae776c6..9bb89573dc0cb8 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -25,8 +25,6 @@
 
 using namespace ompx;
 
-#pragma omp begin declare target device_type(nohost)
-
 static void
 inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
                     KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
@@ -155,5 +153,3 @@ void __kmpc_target_deinit() {
 
 int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
 }
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/LibC.cpp b/offload/DeviceRTL/src/LibC.cpp
index e55008f46269fe..83f9233d948032 100644
--- a/offload/DeviceRTL/src/LibC.cpp
+++ b/offload/DeviceRTL/src/LibC.cpp
@@ -8,8 +8,6 @@
 
 #include "LibC.h"
 
-#pragma omp begin declare target device_type(nohost)
-
 #if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
 extern "C" int vprintf(const char *format, __builtin_va_list) { return -1; }
 #else
@@ -48,5 +46,3 @@ namespace ompx {
   return ::vprintf(Format, vlist);
 }
 } // namespace ompx
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 8583a539824c82..a0c0f6721a84cc 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -15,8 +15,6 @@
 #include "Interface.h"
 #include "State.h"
 
-#pragma omp begin declare target device_type(nohost)
-
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
 
 using namespace ompx;
@@ -24,24 +22,10 @@ using namespace ompx;
 namespace ompx {
 namespace impl {
 
-// Forward declarations defined to be defined for AMDGCN and NVPTX.
-LaneMaskTy activemask();
-LaneMaskTy lanemaskLT();
-LaneMaskTy lanemaskGT();
-uint32_t getThreadIdInWarp();
-uint32_t getThreadIdInBlock(int32_t Dim);
-uint32_t getNumberOfThreadsInBlock(int32_t Dim);
-uint32_t getNumberOfThreadsInKernel();
-uint32_t getBlockIdInKernel(int32_t Dim);
-uint32_t getNumberOfBlocksInKernel(int32_t Dim);
-uint32_t getWarpIdInBlock();
-uint32_t getNumberOfWarpsInBlock();
-uint32_t getWarpSize();
-
 /// AMDGCN Implementation
 ///
 ///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
+#ifdef __AMDGPU__
 
 uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
 
@@ -128,15 +112,13 @@ uint32_t getNumberOfWarpsInBlock() {
   return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
 }
 
-#pragma omp end declare variant
+#endif
 ///}
 
 /// NVPTX Implementation
 ///
 ///{
-#pragma omp begin declare variant match(                                       \
-        device = {arch(nvptx, nvptx64)},                                       \
-            implementation = {extension(match_any)})
+#ifdef __NVPTX__
 
 uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
   switch (Dim) {
@@ -214,7 +196,7 @@ uint32_t getNumberOfWarpsInBlock() {
          mapping::getWarpSize();
 }
 
-#pragma omp end declare variant
+#endif
 ///}
 
 } // namespace impl
@@ -376,7 +358,7 @@ float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta,
 }
 
 long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) {
-  return utils::shuffleDown(mask, var, delta, width);
+  return utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width);
 }
 
 double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta,
@@ -385,5 +367,3 @@ double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta,
       utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width));
 }
 }
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp
index ba6fbf5d5c7e3c..7f44faad2adbfd 100644
--- a/offload/DeviceRTL/src/Misc.cpp
+++ b/offload/DeviceRTL/src/Misc.cpp
@@ -17,19 +17,13 @@
 
 #include "Debug.h"
 
-#pragma omp begin declare target device_type(nohost)
-
 namespace ompx {
 namespace impl {
 
-double getWTick();
-
-double getWTime();
-
 /// AMDGCN Implementation
 ///
 ///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
+#ifdef __AMDGPU__
 
 double getWTick() {
   // The number of ticks per second for the AMDGPU clock varies by card and can
@@ -42,14 +36,12 @@ double getWTime() {
   return static_cast<double>(__builtin_readsteadycounter()) * getWTick();
 }
 
-#pragma omp end declare variant
+#endif
 
 /// NVPTX Implementation
 ///
 ///{
-#pragma omp begin declare variant match(                                       \
-        device = {arch(nvptx, nvptx64)},                                       \
-            implementation = {extension(match_any)})
+#ifdef __NVPTX__
 
 double getWTick() {
   // Timer precision is 1ns
@@ -61,7 +53,7 @@ double getWTime() {
   return static_cast<double>(nsecs) * getWTick();
 }
 
-#pragma omp end declare variant
+#endif
 
 /// Lookup a device-side function using a host pointer /p HstPtr using the table
 /// provided by the device plugin. The table is an ordered pair of host and
@@ -177,4 +169,3 @@ unsigned long long __llvm_omp_host_call(void *fn, void *data, size_t size) {
 }
 
 ///}
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp
index a87e363349b1e5..08ce616aee1c4c 100644
--- a/offload/DeviceRTL/src/Parallelism.cpp
+++ b/offload/DeviceRTL/src/Parallelism.cpp
@@ -43,8 +43,6 @@
 
 using namespace ompx;
 
-#pragma omp begin declare target device_type(nohost)
-
 namespace {
 
 uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
@@ -311,5 +309,3 @@ void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
 
 void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
 }
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Profiling.cpp b/offload/DeviceRTL/src/Profiling.cpp
index bb3caaadcc03dd..df141af5ebeeaf 100644
--- a/offload/DeviceRTL/src/Profiling.cpp
+++ b/offload/DeviceRTL/src/Profiling.cpp
@@ -8,8 +8,6 @@
 
 #include "Profiling.h"
 
-#pragma omp begin declare target device_type(nohost)
-
 extern "C" {
 
 // Provides empty implementations for certain functions in compiler-rt
@@ -18,5 +16,3 @@ void __llvm_profile_register_function(void *Ptr) {}
 void __llvm_profile_register_names_function(void *Ptr, long int I) {}
 void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {}
 }
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp
index d3b4528401953c..a0fea871b5ade6 100644
--- a/offload/DeviceRTL/src/Reduction.cpp
+++ b/offload/DeviceRTL/src/Reduction.cpp
@@ -22,8 +22,6 @@ using namespace ompx;
 
 namespace {
 
-#pragma omp begin declare target device_type(nohost)
-
 void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
   for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
     shflFct(reduce_data, /*LaneId - not used= */ 0,
@@ -316,5 +314,3 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
 void *__kmpc_reduction_get_fixed_buffer() {
   return state::getKernelLaunchEnvironment().ReductionBuffer;
 }
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 100bc8ab47983c..89edb4802198c9 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -23,16 +23,13 @@
 
 using namespace ompx;
 
-#pragma omp begin declare target device_type(nohost)
-
 /// Memory implementation
 ///
 ///{
 
 /// External symbol to access dynamic shared memory.
-[[gnu::aligned(
-    allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[];
-#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
+[[gnu::aligned(allocator::ALIGNMENT)]] extern unsigned char
+    [[clang::address_space(3)]] DynamicSharedBuffer[];
 
 /// The kernel environment passed to the init method by the compiler.
 static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
@@ -452,13 +449,10 @@ void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
 /// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
 constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;
 
-[[clang::loader_uninitialized]] static void
-    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
-#pragma omp allocate(SharedMemVariableSharingSpace)                            \
-    allocator(omp_pteam_mem_alloc)
-[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
-#pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
-    allocator(omp_pteam_mem_alloc)
+[[clang::loader_uninitialized]] static void *[[clang::address_space(
+    3)]] SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
+[[clang::loader_uninitialized]] static void **[[clang::address_space(
+    3)]] SharedMemVariableSharingSpacePtr;
 
 void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
   if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
@@ -481,4 +475,3 @@ void __kmpc_get_shared_variables(void ***GlobalArgs) {
   *GlobalArgs = SharedMemVariableSharingSpacePtr;
 }
 }
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp
index 72a97ae3fcfb42..eaaac2d2e54d07 100644
--- a/offload/DeviceRTL/src/Synchronization.cpp
+++ b/offload/DeviceRTL/src/Synchronization.cpp
@@ -19,8 +19,6 @@
 #include "Mapping.h"
 #include "State.h"
 
-#pragma omp begin declare target device_type(nohost)
-
 using namespace ompx;
 
 namespace impl {
@@ -28,34 +26,12 @@ namespace impl {
 /// Atomics
 ///
 ///{
-/// NOTE: This function needs to be implemented by every target.
-uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
-                   atomic::MemScopeTy MemScope);
 ///}
 
-// Forward declarations defined to be defined for AMDGCN and NVPTX.
-uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
-                   atomic::MemScopeTy MemScope);
-void namedBarrierInit();
-void namedBarrier();
-void fenceTeam(atomic::OrderingTy Ordering);
-void fenceKernel(atomic::OrderingTy Ordering);
-void fenceSystem(atomic::OrderingTy Ordering);
-void syncWarp(__kmpc_impl_lanemask_t);
-void syncThreads(atomic::OrderingTy Ordering);
-void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
-void unsetLock(omp_lock_t *);
-int testLock(omp_lock_t *);
-void initLock(omp_lock_t *);
-void destroyLock(omp_lock_t *);
-void setLock(omp_lock_t *);
-void unsetCriticalLock(omp_lock_t *);
-void setCriticalLock(omp_lock_t *);
-
 /// AMDGCN Implementation
 ///
 ///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
+#ifdef __AMDGPU__
 
 uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
                    atomic::MemScopeTy MemScope) {
@@ -198,15 +174,13 @@ void setCriticalLock(omp_lock_t *Lock) {
   }
 }
 
-#pragma omp end declare variant
+#endif
 ///}
 
 /// NVPTX Implementation
 ///
 ///{
-#pragma omp begin declare variant match(                                       \
-        device = {arch(nvptx, nvptx64)},                                       \
-            implementation = {extension(match_any)})
+#ifdef __NVPTX__
 
 uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
                    atomic::MemScopeTy MemScope) {
@@ -279,7 +253,7 @@ void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); }
 
 void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
 
-#pragma omp end declare variant
+#endif
 ///}
 
 } // namespace impl
@@ -397,5 +371,3 @@ void ompx_sync_block_divergent(int Ordering) {
   impl::syncThreads(atomic::OrderingTy(Ordering));
 }
 } // extern "C"
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp
index 23a967c1a337e2..9dcf51175c7d17 100644
--- a/offload/DeviceRTL/src/Tasking.cpp
+++ b/offload/DeviceRTL/src/Tasking.cpp
@@ -20,8 +20,6 @@
 
 using namespace ompx;
 
-#pragma omp begin declare target device_type(nohost)
-
 extern "C" {
 
 TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
@@ -29,7 +27,7 @@ TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
                                         size_t SharedValuesSize,
                                         TaskFnTy TaskFn) {
   auto TaskSizeInclPrivateValuesPadded =
-      utils::roundUp(TaskSizeInclPrivateValues, uint64_t(sizeof(void *)));
+      utils::roundUp(TaskSizeInclPrivateValues, size_t(sizeof(void *)));
   auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize;
   TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal(
       TaskSizeTotal, "explicit task descriptor");
@@ -103,5 +101,3 @@ int omp_in_final(void) {
 
 int omp_get_max_task_priority(void) { return 0; }
 }
-
-#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index ad60e66548be90..fd47d594a9a0ea 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -44,8 +44,6 @@ struct DynamicScheduleTracker {
 #define NOT_FINISHED 1
 #define LAST_CHUNK 2
 
-#pragma omp begin declare target device_type(nohost)
-
 // TODO: This variable is a hack inherited from the old runtime.
 static uint64_t SHARED(Cnt);
 
@@ -935,5 +933,3 @@ OMP_LOOP_ENTRY(_4u, uint32_t)
 OMP_LOOP_ENTRY(_8, int64_t)
 OMP_LOOP_ENTRY(_8u, uint64_t)
 }
-
-#pragma omp end declare target