[Openmp-commits] [openmp] 67ab875 - [OpenMP] Prototype opt-in new GPU device RTL

Johannes Doerfert via Openmp-commits openmp-commits at lists.llvm.org
Mon Jul 26 22:56:50 PDT 2021


Author: Johannes Doerfert
Date: 2021-07-27T00:56:05-05:00
New Revision: 67ab875ff578588574a63d29d52f73fd25128c74

URL: https://github.com/llvm/llvm-project/commit/67ab875ff578588574a63d29d52f73fd25128c74
DIFF: https://github.com/llvm/llvm-project/commit/67ab875ff578588574a63d29d52f73fd25128c74.diff

LOG: [OpenMP] Prototype opt-in new GPU device RTL

The "old" OpenMP GPU device runtime (D14254) has served us well for many
years but modernizing it has caused some pain recently. This patch
introduces an alternative which is mostly written from scratch embracing
OpenMP 5.X, C++, LLVM coding style (where applicable), and conceptual
interfaces. This new runtime is opt-in through a clang flag (D106793).
The new runtime is currently only build for nvptx and has "-new" in its
name.

The design is tailored towards middle-end optimizations rather than
front-end code generation choices, a trend we already started in the old
runtime a while back. In contrast to the old one, state is organized in
a simple manner rather than a "smart" one. While this can induce costs,
it helps optimizations. Our expectation is that the majority of codes
can be optimized and a "simple" design is therefore preferable. The new
runtime also avoids making users pay for things they do not use,
especially wrt. memory. The unlikely case of nested parallelism is
supported, but it is made costly so that the more likely case uses fewer
resources.

The worksharing and reduction implementations have been taken from the
old runtime and will be rewritten in the future if necessary.

Documentation and debug features are still mostly missing and will be
added over time.

All external symbols start with `__kmpc` for legacy reasons but should
be renamed once we switch over to a single runtime. All internal symbols
are placed in appropriate namespaces (anonymous or `_OMP`) to avoid name
clashes with user symbols.
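
For illustration, the naming split looks like this (a sketch based on
declarations added in this patch, not additional code):

  // External C entry points keep the legacy prefix for now:
  extern "C" int8_t __kmpc_is_spmd_exec_mode();

  // Internal helpers live in namespaces to avoid clashes with user symbols:
  namespace _OMP {
  namespace mapping {
  bool isSPMDMode();
  } // namespace mapping
  } // namespace _OMP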

Differential Revision: https://reviews.llvm.org/D106803

Added: 
    openmp/libomptarget/DeviceRTL/CMakeLists.txt
    openmp/libomptarget/DeviceRTL/include/Configuration.h
    openmp/libomptarget/DeviceRTL/include/Debug.h
    openmp/libomptarget/DeviceRTL/include/Interface.h
    openmp/libomptarget/DeviceRTL/include/Mapping.h
    openmp/libomptarget/DeviceRTL/include/State.h
    openmp/libomptarget/DeviceRTL/include/Synchronization.h
    openmp/libomptarget/DeviceRTL/include/Types.h
    openmp/libomptarget/DeviceRTL/include/Utils.h
    openmp/libomptarget/DeviceRTL/include/generated_microtask_cases.gen
    openmp/libomptarget/DeviceRTL/src/Configuration.cpp
    openmp/libomptarget/DeviceRTL/src/Debug.cpp
    openmp/libomptarget/DeviceRTL/src/Kernel.cpp
    openmp/libomptarget/DeviceRTL/src/Mapping.cpp
    openmp/libomptarget/DeviceRTL/src/Misc.cpp
    openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
    openmp/libomptarget/DeviceRTL/src/Reduction.cpp
    openmp/libomptarget/DeviceRTL/src/State.cpp
    openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
    openmp/libomptarget/DeviceRTL/src/Tasking.cpp
    openmp/libomptarget/DeviceRTL/src/Utils.cpp
    openmp/libomptarget/DeviceRTL/src/Workshare.cpp

Modified: 
    openmp/libomptarget/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt
index 6e7ecdc15252..3f709781893e 100644
--- a/openmp/libomptarget/CMakeLists.txt
+++ b/openmp/libomptarget/CMakeLists.txt
@@ -78,6 +78,7 @@ set(LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER "${LIBOMP_LIBRARY_DIR}" CACHE STRING
 # Build offloading plugins and device RTLs if they are available.
 add_subdirectory(plugins)
 add_subdirectory(deviceRTLs)
+add_subdirectory(DeviceRTL)
 
 # Add tests.
 add_subdirectory(test)

diff  --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
new file mode 100644
index 000000000000..5e98104a46a9
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -0,0 +1,208 @@
+##===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+#
+# Build the Device RTL for all toolchains that are available
+#
+##===----------------------------------------------------------------------===##
+
+# TODO: copied from NVPTX, need to be generalized.
+
+# By default we will not build the NVPTX deviceRTL on a CUDA-free system.
+set(LIBOMPTARGET_BUILD_NVPTX_BCLIB FALSE CACHE BOOL
+  "Whether to build the NVPTX deviceRTL on a CUDA-free system.")
+
+if (NOT (LIBOMPTARGET_DEP_CUDA_FOUND OR LIBOMPTARGET_BUILD_NVPTX_BCLIB))
+  libomptarget_say("Not building NVPTX deviceRTL by default on CUDA free system.")
+  return()
+endif()
+
+# Check if we can create an LLVM bitcode implementation of the runtime library
+# that could be inlined in the user application. For that we need to find
+# a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
+# an LLVM linker.
+set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING
+  "Location of a CUDA compiler capable of emitting LLVM bitcode.")
+set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING
+  "Location of a linker capable of linking LLVM bitcode objects.")
+
+if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "")
+  set(cuda_compiler ${LIBOMPTARGET_NVPTX_CUDA_COMPILER})
+elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
+  set(cuda_compiler ${CMAKE_C_COMPILER})
+else()
+  libomptarget_say("Not building NVPTX deviceRTL: clang not found")
+  return()
+endif()
+
+# Get compiler directory to try to locate a suitable linker.
+get_filename_component(compiler_dir ${cuda_compiler} DIRECTORY)
+set(llvm_link "${compiler_dir}/llvm-link")
+set(opt "${compiler_dir}/opt")
+
+if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "")
+  set(bc_linker ${LIBOMPTARGET_NVPTX_BC_LINKER})
+elseif (EXISTS ${llvm_link})
+  set(bc_linker ${llvm_link})
+else()
+  libomptarget_say("Not building NVPTX deviceRTL: llvm-link not found")
+  return()
+endif()
+
+# TODO: This part needs to be refined when libomptarget is going to support
+# Windows!
+# TODO: This part can also be removed if we can change the clang driver to make
+# it support device only compilation.
+if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
+  set(aux_triple x86_64-unknown-linux-gnu)
+elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le")
+  set(aux_triple powerpc64le-unknown-linux-gnu)
+elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
+  set(aux_triple aarch64-unknown-linux-gnu)
+else()
+  libomptarget_say("Not building CUDA offloading device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
+  return()
+endif()
+
+set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR})
+set(include_directory ${devicertl_base_directory}/include)
+set(source_directory ${devicertl_base_directory}/src)
+
+set(all_capabilities 35 37 50 52 53 60 61 62 70 72 75 80)
+
+set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${all_capabilities} CACHE STRING
+  "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.")
+string(TOLOWER ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES} LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES)
+
+if (LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "all")
+  set(nvptx_sm_list ${all_capabilities})
+elseif(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "auto")
+  if (NOT LIBOMPTARGET_DEP_CUDA_FOUND)
+    libomptarget_error_say("[NVPTX] Cannot auto detect compute capability as CUDA not found.")
+  endif()
+  set(nvptx_sm_list ${LIBOMPTARGET_DEP_CUDA_ARCH})
+else()
+  string(REPLACE "," ";" nvptx_sm_list "${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}")
+endif()
+
+# If user set LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES to empty, we disable the
+# build.
+if (NOT nvptx_sm_list)
+  libomptarget_say("Not building CUDA offloading device RTL: empty compute capability list")
+  return()
+endif()
+
+# Check all SM values
+foreach(sm ${nvptx_sm_list})
+  if (NOT ${sm} IN_LIST all_capabilities)
+    libomptarget_warning_say("[NVPTX] Compute capability ${sm} is not supported. Make sure clang can work with it.")
+  endif()
+endforeach()
+
+# Override default MAX_SM in src/target_impl.h if requested
+if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM)
+  set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}")
+endif()
+
+# Activate RTL message dumps if requested by the user.
+set(LIBOMPTARGET_DEVICE_DEBUG FALSE CACHE BOOL
+  "Activate NVPTX device RTL debug messages.")
+
+libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.")
+
+set(src_files
+  ${source_directory}/Configuration.cpp
+  ${source_directory}/Debug.cpp
+  ${source_directory}/Kernel.cpp
+  ${source_directory}/Mapping.cpp
+  ${source_directory}/Misc.cpp
+  ${source_directory}/Parallelism.cpp
+  ${source_directory}/Reduction.cpp
+  ${source_directory}/State.cpp
+  ${source_directory}/Synchronization.cpp
+  ${source_directory}/Tasking.cpp
+  ${source_directory}/Utils.cpp
+  ${source_directory}/Workshare.cpp
+)
+
+set(clang_opt_flags -O1 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=2048)
+set(link_opt_flags  -O1        -openmp-opt-disable)
+
+# Set flags for LLVM Bitcode compilation.
+set(bc_flags -S -x c++ -std=c++17
+              ${clang_opt_flags}
+             -target nvptx64
+             -Xclang -emit-llvm-bc
+             -Xclang -aux-triple -Xclang ${aux_triple}
+             -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
+             -Xclang -target-feature -Xclang +ptx61
+             -I${include_directory}
+)
+
+if(${LIBOMPTARGET_DEVICE_DEBUG})
+  list(APPEND bc_flags -DOMPTARGET_DEBUG=-1)
+else()
+  list(APPEND bc_flags -DOMPTARGET_DEBUG=0)
+endif()
+
+# Create target to build all Bitcode libraries.
+add_custom_target(omptarget-new-nvptx-bc)
+
+# Generate a Bitcode library for all the compute capabilities the user requested
+foreach(sm ${nvptx_sm_list})
+  # TODO: replace this with declare variant and isa selector.
+  set(cuda_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
+  set(bc_files "")
+  foreach(src ${src_files})
+    get_filename_component(infile ${src} ABSOLUTE)
+    get_filename_component(outfile ${src} NAME)
+    set(outfile "${outfile}-sm_${sm}.bc")
+
+    add_custom_command(OUTPUT ${outfile}
+      COMMAND ${cuda_compiler} ${bc_flags}
+        ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
+      DEPENDS ${infile}
+      IMPLICIT_DEPENDS CXX ${infile}
+      COMMENT "Building LLVM bitcode ${outfile}"
+      VERBATIM
+    )
+    set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
+
+    list(APPEND bc_files ${outfile})
+  endforeach()
+
+  set(bclib_name "libomptarget-new-nvptx-sm_${sm}.bc")
+
+  # Link to a bitcode library.
+  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+      COMMAND ${bc_linker}
+        -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
+      DEPENDS ${bc_files}
+      COMMENT "Linking LLVM bitcode ${bclib_name}"
+  )
+
+  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}_opt
+      COMMAND ${opt} ${link_opt_flags} ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+                      -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+      DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+      COMMENT "Optimizing LLVM bitcode ${bclib_name}"
+  )
+  set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
+
+  set(bclib_target_name "omptarget-new-nvptx-sm_${sm}-bc")
+
+  add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}_opt)
+  add_dependencies(omptarget-new-nvptx-bc ${bclib_target_name})
+
+  # Copy library to destination.
+  add_custom_command(TARGET ${bclib_target_name} POST_BUILD
+                    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+                    ${LIBOMPTARGET_LIBRARY_DIR})
+
+  # Install bitcode library under the lib destination folder.
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+endforeach()

diff  --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h
new file mode 100644
index 000000000000..6fef6b709ba3
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h
@@ -0,0 +1,35 @@
+//===--- Configuration.h - OpenMP device configuration interface -- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// API to query the global (constant) device environment.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_CONFIGURATION_H
+#define OMPTARGET_CONFIGURATION_H
+
+#include "Types.h"
+
+namespace _OMP {
+namespace config {
+
+enum DebugLevel : int32_t { Assertion };
+
+/// Return the number of devices in the system, same number as returned on the
+/// host by omp_get_num_devices.
+uint32_t getNumDevices();
+
+/// Return the user chosen debug level.
+int32_t getDebugLevel();
+
+bool isDebugMode(DebugLevel Level);
+
+} // namespace config
+} // namespace _OMP
+
+#endif
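
For illustration (not part of the patch): a sketch of how the query API above
can gate debug-only work; the function name is hypothetical.

  void maybeReportLaunch() {
    if (_OMP::config::isDebugMode(_OMP::config::DebugLevel::Assertion)) {
      // Debug-only reporting would go here.
    }
  }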

diff  --git a/openmp/libomptarget/DeviceRTL/include/Debug.h b/openmp/libomptarget/DeviceRTL/include/Debug.h
new file mode 100644
index 000000000000..b304bff5302f
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/include/Debug.h
@@ -0,0 +1,30 @@
+//===-------- Debug.h ---- Debug utilities ------------------------ C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_DEVICERTL_DEBUG_H
+#define OMPTARGET_DEVICERTL_DEBUG_H
+
+/// Assertion
+///
+/// {
+extern "C" {
+void __assert_assume(bool cond, const char *exp, const char *file, int line);
+}
+
+#define ASSERT(e) __assert_assume(e, #e, __FILE__, __LINE__)
+
+///}
+
+// TODO: We need to allow actual printf.
+#define PRINTF(fmt, ...) (void)fmt;
+#define PRINT(str) PRINTF("%s", str)
+
+#endif
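
For illustration (not part of the patch): the ASSERT macro above is used like a
plain assert; the stringified condition, file, and line are forwarded to
__assert_assume. The function below is hypothetical.

  void setTeamSize(uint32_t NumThreads) {
    ASSERT(NumThreads > 0 && "team size must be positive");
    // ... use NumThreads ...
  }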

diff  --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h
new file mode 100644
index 000000000000..1c4c877b7662
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -0,0 +1,345 @@
+//===-------- Interface.h - OpenMP interface ---------------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_DEVICERTL_INTERFACE_H
+#define OMPTARGET_DEVICERTL_INTERFACE_H
+
+#include "Types.h"
+
+/// External API
+///
+///{
+
+extern "C" {
+
+/// ICV: dyn-var, constant 0
+///
+/// setter: ignored.
+/// getter: returns 0.
+///
+///{
+void omp_set_dynamic(int);
+int omp_get_dynamic(void);
+///}
+
+/// ICV: nthreads-var, integer
+///
+/// scope: data environment
+///
+/// setter: ignored.
+/// getter: returns false.
+///
+/// implementation notes:
+///
+///
+///{
+void omp_set_num_threads(int);
+int omp_get_max_threads(void);
+///}
+
+/// ICV: thread-limit-var, computed
+///
+/// getter: returns the thread limit defined during launch.
+///
+///{
+int omp_get_thread_limit(void);
+///}
+
+/// ICV: max-active-levels-var, constant 1
+///
+/// setter: ignored.
+/// getter: returns 1.
+///
+///{
+void omp_set_max_active_levels(int);
+int omp_get_max_active_levels(void);
+///}
+
+/// ICV: places-partition-var
+///
+///
+///{
+///}
+
+/// ICV: active-level-var, 0 or 1
+///
+/// getter: returns 0 or 1.
+///
+///{
+int omp_get_active_level(void);
+///}
+
+/// ICV: level-var
+///
+/// getter: returns parallel region nesting
+///
+///{
+int omp_get_level(void);
+///}
+
+/// ICV: run-sched-var
+///
+///
+///{
+void omp_set_schedule(omp_sched_t, int);
+void omp_get_schedule(omp_sched_t *, int *);
+///}
+
+/// TODO this is incomplete.
+int omp_get_num_threads(void);
+int omp_get_thread_num(void);
+void omp_set_nested(int);
+
+int omp_get_nested(void);
+
+void omp_set_max_active_levels(int Level);
+
+int omp_get_max_active_levels(void);
+
+omp_proc_bind_t omp_get_proc_bind(void);
+
+int omp_get_num_places(void);
+
+int omp_get_place_num_procs(int place_num);
+
+void omp_get_place_proc_ids(int place_num, int *ids);
+
+int omp_get_place_num(void);
+
+int omp_get_partition_num_places(void);
+
+void omp_get_partition_place_nums(int *place_nums);
+
+int omp_get_cancellation(void);
+
+void omp_set_default_device(int deviceId);
+
+int omp_get_default_device(void);
+
+int omp_get_num_devices(void);
+
+int omp_get_num_teams(void);
+
+int omp_get_team_num();
+
+int omp_get_initial_device(void);
+
+/// Synchronization
+///
+///{
+void omp_init_lock(omp_lock_t *Lock);
+
+void omp_destroy_lock(omp_lock_t *Lock);
+
+void omp_set_lock(omp_lock_t *Lock);
+
+void omp_unset_lock(omp_lock_t *Lock);
+
+int omp_test_lock(omp_lock_t *Lock);
+///}
+
+/// Tasking
+///
+///{
+int omp_in_final(void);
+
+int omp_get_max_task_priority(void);
+///}
+
+/// Misc
+///
+///{
+double omp_get_wtick(void);
+
+double omp_get_wtime(void);
+///}
+}
+
+extern "C" {
+/// Allocate \p Bytes in "shareable" memory and return the address. Needs to be
+/// called balanced with __kmpc_free_shared like a stack (push/pop). Can be
+/// called by any thread, allocation happens *per thread*.
+void *__kmpc_alloc_shared(uint64_t Bytes);
+
+/// Deallocate \p Ptr. Needs to be called balanced with __kmpc_alloc_shared like
+/// a stack (push/pop). Can be called by any thread. \p Ptr has to be
+/// allocated by __kmpc_alloc_shared by the same thread.
+void __kmpc_free_shared(void *Ptr, uint64_t Bytes);
+
+/// Allocate sufficient space for \p NumArgs sequential `void*` and store the
+/// allocation address in \p GlobalArgs.
+///
+/// Called by the main thread prior to a parallel region.
+///
+/// We also remember it in GlobalArgsPtr to ensure the worker threads and
+/// deallocation function know the allocation address too.
+void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t NumArgs);
+
+/// Deallocate the memory allocated by __kmpc_begin_sharing_variables.
+///
+/// Called by the main thread after a parallel region.
+void __kmpc_end_sharing_variables(void **GlobalArgs, uint64_t NumArgs);
+
+/// Store the allocation address obtained via __kmpc_begin_sharing_variables in
+/// \p GlobalArgs.
+///
+/// Called by the worker threads in the parallel region (function).
+void __kmpc_get_shared_variables(void ***GlobalArgs);
+
+/// Kernel
+///
+///{
+int8_t __kmpc_is_spmd_exec_mode();
+
+int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD,
+                           bool UseGenericStateMachine, bool);
+
+void __kmpc_target_deinit(IdentTy *Ident, bool IsSPMD, bool);
+
+///}
+
+/// Reduction
+///
+///{
+void __kmpc_nvptx_end_reduce(int32_t TId);
+
+void __kmpc_nvptx_end_reduce_nowait(int32_t TId);
+
+int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
+    IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
+    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct);
+
+int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
+    IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
+    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
+    ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
+    ListGlobalFnTy glredFct);
+///}
+
+/// Synchronization
+///
+///{
+void __kmpc_ordered(IdentTy *Loc, int32_t TId);
+
+void __kmpc_end_ordered(IdentTy *Loc, int32_t TId);
+
+int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId);
+
+void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId);
+
+void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId);
+
+int32_t __kmpc_master(IdentTy *Loc, int32_t TId);
+
+void __kmpc_end_master(IdentTy *Loc, int32_t TId);
+
+int32_t __kmpc_single(IdentTy *Loc, int32_t TId);
+
+void __kmpc_end_single(IdentTy *Loc, int32_t TId);
+
+void __kmpc_flush(IdentTy *Loc);
+
+__kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask();
+
+void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask);
+
+void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name);
+
+void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name);
+///}
+
+/// Parallelism
+///
+///{
+/// TODO
+void __kmpc_kernel_prepare_parallel(ParallelRegionFnTy WorkFn);
+
+/// TODO
+bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn);
+
+/// TODO
+void __kmpc_kernel_end_parallel();
+
+/// TODO
+void __kmpc_serialized_parallel(IdentTy *Loc, uint32_t);
+
+/// TODO
+void __kmpc_end_serialized_parallel(IdentTy *Loc, uint32_t);
+
+/// TODO
+void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind);
+
+/// TODO
+void __kmpc_push_num_teams(IdentTy *Loc, int32_t TId, int32_t NumTeams,
+                           int32_t ThreadLimit);
+
+/// TODO
+uint16_t __kmpc_parallel_level(IdentTy *Loc, uint32_t);
+
+/// TODO
+void __kmpc_push_num_threads(IdentTy *Loc, int32_t, int32_t NumThreads);
+///}
+
+/// Tasking
+///
+///{
+TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, uint32_t, int32_t,
+                                        uint32_t TaskSizeInclPrivateValues,
+                                        uint32_t SharedValuesSize,
+                                        TaskFnTy TaskFn);
+
+int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId,
+                        TaskDescriptorTy *TaskDescriptor);
+
+int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId,
+                                  TaskDescriptorTy *TaskDescriptor, int32_t,
+                                  void *, int32_t, void *);
+
+void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId,
+                               TaskDescriptorTy *TaskDescriptor);
+
+void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId,
+                                  TaskDescriptorTy *TaskDescriptor);
+
+void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t,
+                          void *);
+
+void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId);
+
+void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId);
+
+int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int);
+
+int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId);
+
+void __kmpc_taskloop(IdentTy *Loc, uint32_t TId,
+                     TaskDescriptorTy *TaskDescriptor, int,
+                     uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int,
+                     int32_t, uint64_t, void *);
+///}
+
+/// Misc
+///
+///{
+int32_t __kmpc_cancellationpoint(IdentTy *Loc, int32_t TId, int32_t CancelVal);
+
+int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal);
+///}
+
+/// Shuffle
+///
+///{
+int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
+int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
+///}
+}
+
+#endif
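
For illustration (not part of the patch): a sketch of how the variable-sharing
entry points declared above are intended to be sequenced. The function and
variable names are hypothetical; only the __kmpc_* calls come from this header.

  void mainThread(int *A, double *B) {
    void **GlobalArgs;
    __kmpc_begin_sharing_variables(&GlobalArgs, /*NumArgs=*/2); // allocate slots
    GlobalArgs[0] = A;                                          // publish captured values
    GlobalArgs[1] = B;
    // ... launch the parallel region; workers execute workerThread() ...
    __kmpc_end_sharing_variables(GlobalArgs, /*NumArgs=*/2);    // release the slots
  }

  void workerThread() {
    void **GlobalArgs;
    __kmpc_get_shared_variables(&GlobalArgs); // retrieve the shared argument array
    int *A = static_cast<int *>(GlobalArgs[0]);
    double *B = static_cast<double *>(GlobalArgs[1]);
    // ... use A and B ...
  }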

diff  --git a/openmp/libomptarget/DeviceRTL/include/Mapping.h b/openmp/libomptarget/DeviceRTL/include/Mapping.h
new file mode 100644
index 000000000000..b34ecf4f7738
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/include/Mapping.h
@@ -0,0 +1,86 @@
+//===--------- Mapping.h - OpenMP device runtime mapping helpers -- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_MAPPING_H
+#define OMPTARGET_MAPPING_H
+
+#include "Types.h"
+
+namespace _OMP {
+
+namespace mapping {
+
+#pragma omp declare target
+
+inline constexpr uint32_t MaxThreadsPerTeam = 1024;
+
+#pragma omp end declare target
+
+/// Initialize the mapping machinery.
+void init(bool IsSPMD);
+
+/// Return true if the kernel is executed in SPMD mode.
+bool isSPMDMode();
+
+/// Return true if the kernel is executed in generic mode.
+bool isGenericMode();
+
+/// Return true if the executing thread is the main thread in generic mode.
+bool isMainThreadInGenericMode();
+
+/// Return true if the executing thread has the lowest Id of the active threads
+/// in the warp.
+bool isLeaderInWarp();
+
+/// Return a mask describing all active threads in the warp.
+LaneMaskTy activemask();
+
+/// Return a mask describing all threads with a smaller Id in the warp.
+LaneMaskTy lanemaskLT();
+
+/// Return a mask describing all threads with a larger Id in the warp.
+LaneMaskTy lanemaskGT();
+
+/// Return the thread Id in the warp, in [0, getWarpSize()).
+uint32_t getThreadIdInWarp();
+
+/// Return the thread Id in the block, in [0, getBlockSize()).
+uint32_t getThreadIdInBlock();
+
+/// Return the warp id in the block.
+uint32_t getWarpId();
+
+/// Return the warp size, thus number of threads in the warp.
+uint32_t getWarpSize();
+
+/// Return the number of warps in the block.
+uint32_t getNumberOfWarpsInBlock();
+
+/// Return the block Id in the kernel, in [0, getNumberOfBlocks()).
+uint32_t getBlockId();
+
+/// Return the block size, thus number of threads in the block.
+uint32_t getBlockSize();
+
+/// Return the number of blocks in the kernel.
+uint32_t getNumberOfBlocks();
+
+/// Return the kernel size, thus number of threads in the kernel.
+uint32_t getKernelSize();
+
+/// Return the number of processing elements on the device.
+uint32_t getNumberOfProcessorElements();
+
+} // namespace mapping
+
+} // namespace _OMP
+
+#endif

diff  --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h
new file mode 100644
index 000000000000..63e0923d4154
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/include/State.h
@@ -0,0 +1,200 @@
+//===-------- State.h - OpenMP State & ICV interface ------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_STATE_H
+#define OMPTARGET_STATE_H
+
+#include "Debug.h"
+#include "Types.h"
+
+#pragma omp declare target
+
+namespace _OMP {
+
+namespace state {
+
+inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE;
+
+/// Initialize the state machinery. Must be called by all threads.
+void init(bool IsSPMD);
+
+/// TODO
+enum ValueKind {
+  VK_NThreads,
+  VK_Level,
+  VK_ActiveLevel,
+  VK_MaxActiveLevels,
+  VK_RunSched,
+  // ---
+  VK_RunSchedChunk,
+  VK_ParallelRegionFn,
+  VK_ParallelTeamSize,
+};
+
+/// TODO
+void enterDataEnvironment();
+
+/// TODO
+void exitDataEnvironment();
+
+/// TODO
+struct DateEnvironmentRAII {
+  DateEnvironmentRAII() { enterDataEnvironment(); }
+  ~DateEnvironmentRAII() { exitDataEnvironment(); }
+};
+
+/// TODO
+void resetStateForThread(uint32_t TId);
+
+uint32_t &lookup32(ValueKind VK, bool IsReadonly);
+void *&lookupPtr(ValueKind VK, bool IsReadonly);
+
+/// A class without actual state used to provide a nice interface to lookup and
+/// update ICV values we can declare in global scope.
+template <typename Ty, ValueKind Kind> struct Value {
+  __attribute__((flatten, always_inline)) operator Ty() {
+    return lookup(/* IsReadonly */ true);
+  }
+
+  __attribute__((flatten, always_inline)) Value &operator=(const Ty &Other) {
+    set(Other);
+    return *this;
+  }
+
+  __attribute__((flatten, always_inline)) Value &operator++() {
+    inc(1);
+    return *this;
+  }
+
+  __attribute__((flatten, always_inline)) Value &operator--() {
+    inc(-1);
+    return *this;
+  }
+
+private:
+  Ty &lookup(bool IsReadonly) {
+    Ty &t = lookup32(Kind, IsReadonly);
+    return t;
+  }
+
+  Ty &inc(int UpdateVal) {
+    return (lookup(/* IsReadonly */ false) += UpdateVal);
+  }
+
+  Ty &set(Ty UpdateVal) { return (lookup(/* IsReadonly */ false) = UpdateVal); }
+
+  template <typename VTy, typename Ty2> friend struct ValueRAII;
+};
+
+/// A mockup class without actual state used to provide
+/// a nice interface to lookup and update ICV values
+/// we can declare in global scope.
+template <typename Ty, ValueKind Kind> struct PtrValue {
+  __attribute__((flatten, always_inline)) operator Ty() {
+    return lookup(/* IsReadonly */ true);
+  }
+
+  __attribute__((flatten, always_inline)) PtrValue &operator=(const Ty Other) {
+    set(Other);
+    return *this;
+  }
+
+private:
+  Ty &lookup(bool IsReadonly) { return lookupPtr(Kind, IsReadonly); }
+
+  Ty &set(Ty UpdateVal) { return (lookup(/* IsReadonly */ false) = UpdateVal); }
+
+  template <typename VTy, typename Ty2> friend struct ValueRAII;
+};
+
+template <typename VTy, typename Ty> struct ValueRAII {
+  ValueRAII(VTy &V, Ty NewValue, Ty OldValue, bool Active)
+      : Ptr(Active ? V.lookup(/* IsReadonly */ false) : Val), Val(OldValue),
+        Active(Active) {
+    if (!Active)
+      return;
+    ASSERT(Ptr == OldValue && "ValueRAII initialization with wrong old value!");
+    Ptr = NewValue;
+  }
+  ~ValueRAII() {
+    if (Active)
+      Ptr = Val;
+  }
+
+private:
+  Ty &Ptr;
+  Ty Val;
+  bool Active;
+};
+
+/// TODO
+inline state::Value<uint32_t, state::VK_RunSchedChunk> RunSchedChunk;
+
+/// TODO
+inline state::Value<uint32_t, state::VK_ParallelTeamSize> ParallelTeamSize;
+
+/// TODO
+inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn>
+    ParallelRegionFn;
+
+void runAndCheckState(void(Func(void)));
+
+void assumeInitialState(bool IsSPMD);
+
+} // namespace state
+
+namespace icv {
+
+/// TODO
+inline state::Value<uint32_t, state::VK_NThreads> NThreads;
+
+/// TODO
+inline state::Value<uint32_t, state::VK_Level> Level;
+
+/// The `active-level` describes which of the parallel levels counted with the
+/// `level-var` is active. There can only be one.
+///
+/// active-level-var is 1 if ActiveLevelVar is not 0, otherwise it is 0.
+inline state::Value<uint32_t, state::VK_ActiveLevel> ActiveLevel;
+
+/// TODO
+inline state::Value<uint32_t, state::VK_MaxActiveLevels> MaxActiveLevels;
+
+/// TODO
+inline state::Value<uint32_t, state::VK_RunSched> RunSched;
+
+} // namespace icv
+
+namespace memory {
+
+/// Allocate \p Size bytes in shared memory, if possible, for \p Reason.
+///
+/// Note: See the restrictions on __kmpc_alloc_shared for proper usage.
+void *allocShared(uint64_t Size, const char *Reason);
+
+/// Free \p Ptr, allocated via allocShared, for \p Reason.
+///
+/// Note: See the restrictions on __kmpc_free_shared for proper usage.
+void freeShared(void *Ptr, uint64_t Bytes, const char *Reason);
+
+/// Allocate \p Size bytes in global memory, if possible, for \p Reason.
+void *allocGlobal(uint64_t Size, const char *Reason);
+
+/// Free \p Ptr, allocated via allocGlobal, for \p Reason.
+void freeGlobal(void *Ptr, const char *Reason);
+
+} // namespace memory
+
+} // namespace _OMP
+
+#pragma omp end declare target
+
+#endif
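
For illustration (not part of the patch): a minimal sketch of how the Value
wrappers above are meant to be used; reads and writes are forwarded to
lookup32. The function below is hypothetical.

  using namespace _OMP;

  void enterNestedRegion() {
    ++icv::Level;                       // operator++ bumps the backing ICV
    uint32_t CurrentLevel = icv::Level; // operator Ty() reads it back
    state::ParallelTeamSize = 32;       // operator= updates team-local state
    (void)CurrentLevel;
  }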

diff  --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
new file mode 100644
index 000000000000..ace624e3887f
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -0,0 +1,69 @@
+//===- Synchronization.h - OpenMP synchronization utilities ------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H
+#define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H
+
+#include "Types.h"
+
+namespace _OMP {
+
+namespace synchronize {
+
+/// Initialize the synchronization machinery. Must be called by all threads.
+void init(bool IsSPMD);
+
+/// Synchronize all threads in a warp identified by \p Mask.
+void warp(LaneMaskTy Mask);
+
+/// Synchronize all threads in a block.
+void threads();
+
+} // namespace synchronize
+
+namespace fence {
+
+/// Memory fence with \p Ordering semantics for the team.
+void team(int Ordering);
+
+/// Memory fence with \p Ordering semantics for the contention group.
+void kernel(int Ordering);
+
+/// Memory fence with \p Ordering semantics for the system.
+void system(int Ordering);
+
+} // namespace fence
+
+namespace atomic {
+
+/// Atomically read \p Addr with \p Ordering semantics.
+uint32_t read(uint32_t *Addr, int Ordering);
+
+/// Atomically store \p V to \p Addr with \p Ordering semantics.
+uint32_t store(uint32_t *Addr, uint32_t V, int Ordering);
+
+/// Atomically store \p V to \p Addr with \p Ordering semantics.
+uint64_t store(uint64_t *Addr, uint64_t V, int Ordering);
+
+/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
+uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering);
+
+/// Atomically add \p V to \p *Addr with \p Ordering semantics.
+uint32_t add(uint32_t *Addr, uint32_t V, int Ordering);
+
+/// Atomically add \p V to \p *Addr with \p Ordering semantics.
+uint64_t add(uint64_t *Addr, uint64_t V, int Ordering);
+
+} // namespace atomic
+
+} // namespace _OMP
+
+#endif

diff  --git a/openmp/libomptarget/DeviceRTL/include/Types.h b/openmp/libomptarget/DeviceRTL/include/Types.h
new file mode 100644
index 000000000000..5b15778d5a2e
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/include/Types.h
@@ -0,0 +1,200 @@
+//===---------- Types.h - OpenMP types ---------------------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_TYPES_H
+#define OMPTARGET_TYPES_H
+
+/// Base type declarations for freestanding mode
+///
+///{
+using int8_t = char;
+using uint8_t = unsigned char;
+using int16_t = short;
+using uint16_t = unsigned short;
+using int32_t = int;
+using uint32_t = unsigned int;
+using int64_t = long;
+using uint64_t = unsigned long;
+
+static_assert(sizeof(int8_t) == 1, "type size mismatch");
+static_assert(sizeof(uint8_t) == 1, "type size mismatch");
+static_assert(sizeof(int16_t) == 2, "type size mismatch");
+static_assert(sizeof(uint16_t) == 2, "type size mismatch");
+static_assert(sizeof(int32_t) == 4, "type size mismatch");
+static_assert(sizeof(uint32_t) == 4, "type size mismatch");
+static_assert(sizeof(int64_t) == 8, "type size mismatch");
+static_assert(sizeof(uint64_t) == 8, "type size mismatch");
+///}
+
+enum omp_proc_bind_t {
+  omp_proc_bind_false = 0,
+  omp_proc_bind_true = 1,
+  omp_proc_bind_master = 2,
+  omp_proc_bind_close = 3,
+  omp_proc_bind_spread = 4
+};
+
+enum omp_sched_t {
+  omp_sched_static = 1,  /* chunkSize >0 */
+  omp_sched_dynamic = 2, /* chunkSize >0 */
+  omp_sched_guided = 3,  /* chunkSize >0 */
+  omp_sched_auto = 4,    /* no chunkSize */
+};
+
+enum kmp_sched_t {
+  kmp_sched_static_chunk = 33,
+  kmp_sched_static_nochunk = 34,
+  kmp_sched_dynamic = 35,
+  kmp_sched_guided = 36,
+  kmp_sched_runtime = 37,
+  kmp_sched_auto = 38,
+
+  kmp_sched_static_balanced_chunk = 45,
+
+  kmp_sched_static_ordered = 65,
+  kmp_sched_static_nochunk_ordered = 66,
+  kmp_sched_dynamic_ordered = 67,
+  kmp_sched_guided_ordered = 68,
+  kmp_sched_runtime_ordered = 69,
+  kmp_sched_auto_ordered = 70,
+
+  kmp_sched_distr_static_chunk = 91,
+  kmp_sched_distr_static_nochunk = 92,
+  kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
+
+  kmp_sched_default = kmp_sched_static_nochunk,
+  kmp_sched_unordered_first = kmp_sched_static_chunk,
+  kmp_sched_unordered_last = kmp_sched_auto,
+  kmp_sched_ordered_first = kmp_sched_static_ordered,
+  kmp_sched_ordered_last = kmp_sched_auto_ordered,
+  kmp_sched_distribute_first = kmp_sched_distr_static_chunk,
+  kmp_sched_distribute_last =
+      kmp_sched_distr_static_chunk_sched_static_chunkone,
+
+  /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers.
+   * Since we need to distinguish the three possible cases (no modifier,
+   * monotonic modifier, nonmonotonic modifier), we need separate bits for
+   * each modifier. The absence of monotonic does not imply nonmonotonic,
+   * especially since 4.5 says that the behaviour of the "no modifier" case
+   * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0.
+   *
+   * Since we're passing a full 32 bit value, we can use a couple of high
+   * bits for these flags; out of paranoia we avoid the sign bit.
+   *
+   * These modifiers can be or-ed into non-static schedules by the compiler
+   * to pass the additional information. They will be stripped early in the
+   * processing in __kmp_dispatch_init when setting up schedules, so
+   * most of the code won't ever see schedules with these bits set.
+   */
+  kmp_sched_modifier_monotonic = (1 << 29),
+  /**< Set if the monotonic schedule modifier was present */
+  kmp_sched_modifier_nonmonotonic = (1 << 30),
+/**< Set if the nonmonotonic schedule modifier was present */
+
+#define SCHEDULE_WITHOUT_MODIFIERS(s)                                          \
+  (enum kmp_sched_t)(                                                          \
+      (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic))
+#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0)
+#define SCHEDULE_HAS_NONMONOTONIC(s)                                           \
+  (((s)&kmp_sched_modifier_nonmonotonic) != 0)
+#define SCHEDULE_HAS_NO_MODIFIERS(s)                                           \
+  (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \
+   0)
+
+};
+
+struct TaskDescriptorTy;
+using TaskFnTy = int32_t (*)(int32_t global_tid, TaskDescriptorTy *taskDescr);
+struct TaskDescriptorTy {
+  void *Payload;
+  TaskFnTy TaskFn;
+};
+
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+using LaneMaskTy = uint64_t;
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(                                       \
+    device = {arch(amdgcn)}, implementation = {extension(match_none)})
+using LaneMaskTy = uint64_t;
+#pragma omp end declare variant
+
+namespace lanes {
+enum : LaneMaskTy { All = ~(LaneMaskTy)0 };
+} // namespace lanes
+
+/// The ident structure that describes a source location. The struct is
+/// identical to the one in the kmp.h file. We maintain the same data structure
+/// for compatibility.
+struct IdentTy {
+  int32_t reserved_1;  /**<  might be used in Fortran; see above  */
+  int32_t flags;       /**<  also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC
+                            identifies this union member  */
+  int32_t reserved_2;  /**<  not really used in Fortran any more; see above */
+  int32_t reserved_3;  /**<  source[4] in Fortran, do not use for C++  */
+  char const *psource; /**<  String describing the source location.
+                       The string is composed of semi-colon separated fields
+                       which describe the source file, the function and a pair
+                       of line numbers that delimit the construct. */
+};
+
+using __kmpc_impl_lanemask_t = LaneMaskTy;
+
+using ParallelRegionFnTy = void *;
+
+using CriticalNameTy = int32_t[8];
+
+struct omp_lock_t {
+  void *Lock;
+};
+
+using InterWarpCopyFnTy = void (*)(void *src, int32_t warp_num);
+using ShuffleReductFnTy = void (*)(void *rhsData, int16_t lane_id,
+                                   int16_t lane_offset, int16_t shortCircuit);
+using ListGlobalFnTy = void (*)(void *buffer, int idx, void *reduce_data);
+
+/// Macros for allocating variables in different address spaces.
+///{
+
+// Follows the pattern in interface.h
+typedef enum omp_allocator_handle_t {
+  omp_null_allocator = 0,
+  omp_default_mem_alloc = 1,
+  omp_large_cap_mem_alloc = 2,
+  omp_const_mem_alloc = 3,
+  omp_high_bw_mem_alloc = 4,
+  omp_low_lat_mem_alloc = 5,
+  omp_cgroup_mem_alloc = 6,
+  omp_pteam_mem_alloc = 7,
+  omp_thread_mem_alloc = 8,
+  KMP_ALLOCATOR_MAX_HANDLE = ~(0U)
+} omp_allocator_handle_t;
+
+#define __PRAGMA(STR) _Pragma(#STR)
+#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
+
+#define SHARED(NAME)                                                           \
+  NAME [[clang::loader_uninitialized]];                                        \
+  OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
+
+// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
+//       now that's not the case.
+#define THREAD_LOCAL(NAME)                                                     \
+  NAME [[clang::loader_uninitialized, clang::address_space(5)]]
+
+// TODO: clang should use address space 4 for omp_const_mem_alloc, maybe it
+//       does?
+#define CONSTANT(NAME)                                                         \
+  NAME [[clang::loader_uninitialized, clang::address_space(4)]]
+
+///}
+
+#endif
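
For illustration (not part of the patch): how the allocation macros above are
used at file scope; the variable names are hypothetical.

  // Team-shared counter, placed via omp_pteam_mem_alloc.
  static uint32_t SHARED(TeamCounter);

  // Thread-private scratch value in address space 5.
  static uint32_t THREAD_LOCAL(ThreadScratch);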

diff  --git a/openmp/libomptarget/DeviceRTL/include/Utils.h b/openmp/libomptarget/DeviceRTL/include/Utils.h
new file mode 100644
index 000000000000..912c40781612
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/include/Utils.h
@@ -0,0 +1,72 @@
+//===--------- Utils.h - OpenMP device runtime utility functions -- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_DEVICERTL_UTILS_H
+#define OMPTARGET_DEVICERTL_UTILS_H
+
+#include "Types.h"
+
+namespace _OMP {
+namespace utils {
+
+/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread
+/// is identified by \p Mask.
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
+
+int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width);
+
+/// Return \p LowBits and \p HighBits packed into a single 64 bit value.
+uint64_t pack(uint32_t LowBits, uint32_t HighBits);
+
+/// Unpack \p Val into \p LowBits and \p HighBits.
+void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits);
+
+/// Round up \p V to a \p Boundary.
+template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
+  return (V + Boundary - 1) / Boundary * Boundary;
+}
+
+/// Advance \p Ptr by \p Bytes bytes.
+template <typename Ty1, typename Ty2> inline Ty1 *advance(Ty1 Ptr, Ty2 Bytes) {
+  return reinterpret_cast<Ty1 *>(reinterpret_cast<char *>(Ptr) + Bytes);
+}
+
+/// Return the first bit set in \p V.
+inline uint32_t ffs(uint32_t V) {
+  static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch");
+  return __builtin_ffs(V);
+}
+
+/// Return the first bit set in \p V.
+inline uint32_t ffs(uint64_t V) {
+  static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch");
+  return __builtin_ffsl(V);
+}
+
+/// Return the number of bits set in \p V.
+inline uint32_t popc(uint32_t V) {
+  static_assert(sizeof(int) == sizeof(uint32_t), "type size mismatch");
+  return __builtin_popcount(V);
+}
+
+/// Return the number of bits set in \p V.
+inline uint32_t popc(uint64_t V) {
+  static_assert(sizeof(long) == sizeof(uint64_t), "type size mismatch");
+  return __builtin_popcountl(V);
+}
+
+#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
+#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
+
+} // namespace utils
+} // namespace _OMP
+
+#endif
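
For illustration (not part of the patch): a sketch of the packing and rounding
helpers declared above; the values are arbitrary.

  void packExample() {
    uint32_t Lo = 0x1234, Hi = 0xabcd;
    uint64_t Packed = _OMP::utils::pack(Lo, Hi);

    uint32_t Lo2, Hi2;
    _OMP::utils::unpack(Packed, Lo2, Hi2); // Lo2 == Lo, Hi2 == Hi

    // Round 100 up to a 16-byte boundary, i.e. 112.
    uint64_t Padded = _OMP::utils::roundUp<uint64_t>(100, 16);
    (void)Padded;
  }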

diff  --git a/openmp/libomptarget/DeviceRTL/include/generated_microtask_cases.gen b/openmp/libomptarget/DeviceRTL/include/generated_microtask_cases.gen
new file mode 100644
index 000000000000..5abbbe4ba652
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/include/generated_microtask_cases.gen
@@ -0,0 +1,405 @@
+case 0:
+((void (*)(int32_t *, int32_t *
+))fn)(&global_tid, &bound_tid
+);
+break;
+case 1:
+((void (*)(int32_t *, int32_t *
+, void *))fn)(&global_tid, &bound_tid
+, args[0]);
+break;
+case 2:
+((void (*)(int32_t *, int32_t *
+, void *, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1]);
+break;
+case 3:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2]);
+break;
+case 4:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+);
+break;
+case 5:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4]);
+break;
+case 6:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5]);
+break;
+case 7:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6]);
+break;
+case 8:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+);
+break;
+case 9:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8]);
+break;
+case 10:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9]);
+break;
+case 11:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10]);
+break;
+case 12:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+);
+break;
+case 13:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12]);
+break;
+case 14:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13]);
+break;
+case 15:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13], args[14]);
+break;
+case 16:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13], args[14], args[15]
+);
+break;
+case 17:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13], args[14], args[15]
+, args[16]);
+break;
+case 18:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13], args[14], args[15]
+, args[16], args[17]);
+break;
+case 19:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13], args[14], args[15]
+, args[16], args[17], args[18]);
+break;
+case 20:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13], args[14], args[15]
+, args[16], args[17], args[18], args[19]
+);
+break;
+case 21:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13], args[14], args[15]
+, args[16], args[17], args[18], args[19]
+, args[20]);
+break;
+case 22:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13], args[14], args[15]
+, args[16], args[17], args[18], args[19]
+, args[20], args[21]);
+break;
+case 23:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13], args[14], args[15]
+, args[16], args[17], args[18], args[19]
+, args[20], args[21], args[22]);
+break;
+case 24:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13], args[14], args[15]
+, args[16], args[17], args[18], args[19]
+, args[20], args[21], args[22], args[23]
+);
+break;
+case 25:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13], args[14], args[15]
+, args[16], args[17], args[18], args[19]
+, args[20], args[21], args[22], args[23]
+, args[24]);
+break;
+case 26:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13], args[14], args[15]
+, args[16], args[17], args[18], args[19]
+, args[20], args[21], args[22], args[23]
+, args[24], args[25]);
+break;
+case 27:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13], args[14], args[15]
+, args[16], args[17], args[18], args[19]
+, args[20], args[21], args[22], args[23]
+, args[24], args[25], args[26]);
+break;
+case 28:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13], args[14], args[15]
+, args[16], args[17], args[18], args[19]
+, args[20], args[21], args[22], args[23]
+, args[24], args[25], args[26], args[27]
+);
+break;
+case 29:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13], args[14], args[15]
+, args[16], args[17], args[18], args[19]
+, args[20], args[21], args[22], args[23]
+, args[24], args[25], args[26], args[27]
+, args[28]);
+break;
+case 30:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13], args[14], args[15]
+, args[16], args[17], args[18], args[19]
+, args[20], args[21], args[22], args[23]
+, args[24], args[25], args[26], args[27]
+, args[28], args[29]);
+break;
+case 31:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13], args[14], args[15]
+, args[16], args[17], args[18], args[19]
+, args[20], args[21], args[22], args[23]
+, args[24], args[25], args[26], args[27]
+, args[28], args[29], args[30]);
+break;
+case 32:
+((void (*)(int32_t *, int32_t *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+, void *, void *, void *, void *
+))fn)(&global_tid, &bound_tid
+, args[0], args[1], args[2], args[3]
+, args[4], args[5], args[6], args[7]
+, args[8], args[9], args[10], args[11]
+, args[12], args[13], args[14], args[15]
+, args[16], args[17], args[18], args[19]
+, args[20], args[21], args[22], args[23]
+, args[24], args[25], args[26], args[27]
+, args[28], args[29], args[30], args[31]
+);
+break;

diff  --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
new file mode 100644
index 000000000000..ac67af144897
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
@@ -0,0 +1,44 @@
+//===- Configuration.cpp - OpenMP device configuration interface -- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the data object of the constant device environment and the
+// query API.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Configuration.h"
+#include "State.h"
+#include "Types.h"
+
+using namespace _OMP;
+
+struct DeviceEnvironmentTy {
+  int32_t DebugLevel;
+};
+
+#pragma omp declare target
+
+// TODO: We want to change the name as soon as the old runtime is gone.
+DeviceEnvironmentTy CONSTANT(omptarget_device_environment)
+    __attribute__((used));
+
+int32_t config::getDebugLevel() {
+  // TODO: Implement libomptarget initialization of DeviceEnvironmentTy
+  return 0;
+}
+
+uint32_t config::getNumDevices() {
+  // TODO: Implement libomptarget initialization of DeviceEnvironmentTy
+  return 1;
+}
+
+bool config::isDebugMode(config::DebugLevel Level) {
+  return config::getDebugLevel() > Level;
+}
+
+#pragma omp end declare target
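
For context: omptarget_device_environment is a plain struct the host runtime is expected to write into device memory before a kernel launches; the query functions above will simply read it once that initialization lands (hence the TODOs). A rough host-side sketch of that flow, where writeToDeviceSymbol stands in for whatever copy-to-symbol API the plugin ends up using (illustrative only, not part of this patch):

    #include <cstdint>

    struct DeviceEnvironmentTy {
      int32_t DebugLevel;
    };

    // Hypothetical plugin helper: populate the environment and copy it to the
    // device-side symbol `omptarget_device_environment` before kernel launch.
    void initDeviceEnvironment(
        int32_t DebugLevel,
        void (*writeToDeviceSymbol)(const char *, const void *, uint64_t)) {
      DeviceEnvironmentTy Env{DebugLevel};
      writeToDeviceSymbol("omptarget_device_environment", &Env, sizeof(Env));
    }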

diff  --git a/openmp/libomptarget/DeviceRTL/src/Debug.cpp b/openmp/libomptarget/DeviceRTL/src/Debug.cpp
new file mode 100644
index 000000000000..69d04d69604d
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Debug.cpp
@@ -0,0 +1,31 @@
+//===--- Debug.cpp -------- Debug utilities ----------------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains debug utilities
+//
+//===----------------------------------------------------------------------===//
+
+#include "Debug.h"
+#include "Configuration.h"
+
+using namespace _OMP;
+
+#pragma omp declare target
+
+extern "C" {
+void __assert_assume(bool cond, const char *exp, const char *file, int line) {
+  if (!cond && config::isDebugMode(config::DebugLevel::Assertion)) {
+    PRINTF("ASSERTION failed: %s at %s, line %d\n", exp, file, line);
+    __builtin_trap();
+  }
+
+  __builtin_assume(cond);
+}
+}
+
+#pragma omp end declare target
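
Debug.h itself is not part of this hunk; presumably the ASSERT macro used throughout the new sources routes into __assert_assume along these lines (a guess at the wiring, not the actual header):

    // Hypothetical macro definition; the real Debug.h may differ.
    extern "C" void __assert_assume(bool cond, const char *exp,
                                    const char *file, int line);

    #define ASSERT(expr) __assert_assume(expr, #expr, __FILE__, __LINE__)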

diff  --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
new file mode 100644
index 000000000000..4959177d4293
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -0,0 +1,111 @@
+//===--- Kernel.cpp - OpenMP device kernel interface -------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the kernel entry points for the device.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Debug.h"
+#include "Interface.h"
+#include "Mapping.h"
+#include "State.h"
+#include "Synchronization.h"
+#include "Types.h"
+
+using namespace _OMP;
+
+#pragma omp declare target
+
+static void initializeRuntime(bool IsSPMD) {
+  // Order is important here.
+  synchronize::init(IsSPMD);
+  mapping::init(IsSPMD);
+  state::init(IsSPMD);
+}
+
+/// Simple generic state machine for worker threads.
+static void genericStateMachine(IdentTy *Ident) {
+
+  uint32_t TId = mapping::getThreadIdInBlock();
+
+  do {
+    ParallelRegionFnTy WorkFn = 0;
+
+    // Wait for the signal that we have a new work function.
+    synchronize::threads();
+
+    // Retrieve the work function from the runtime.
+    bool IsActive = __kmpc_kernel_parallel(&WorkFn);
+
+    // If there is nothing more to do, break out of the state machine by
+    // returning to the caller.
+    if (!WorkFn)
+      return;
+
+    if (IsActive) {
+      ASSERT(!mapping::isSPMDMode());
+      ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId);
+      __kmpc_kernel_end_parallel();
+    }
+
+    synchronize::threads();
+
+  } while (true);
+}
+
+extern "C" {
+
+/// Initialization
+///
+/// \param Ident               Source location identification, can be NULL.
+///
+int32_t __kmpc_target_init(IdentTy *Ident, bool IsSPMD,
+                           bool UseGenericStateMachine, bool) {
+  if (IsSPMD) {
+    initializeRuntime(/* IsSPMD */ true);
+    synchronize::threads();
+  } else {
+    initializeRuntime(/* IsSPMD */ false);
+    // No need to wait since only the main thread will execute user
+    // code and workers will run into a barrier right away.
+  }
+
+  if (IsSPMD) {
+    state::assumeInitialState(IsSPMD);
+    return -1;
+  }
+
+  if (mapping::isMainThreadInGenericMode())
+    return -1;
+
+  if (UseGenericStateMachine)
+    genericStateMachine(Ident);
+
+  return mapping::getThreadIdInBlock();
+}
+
+/// De-Initialization
+///
+/// In non-SPMD, this function releases the workers trapped in a state machine
+/// and also any memory dynamically allocated by the runtime.
+///
+/// \param Ident Source location identification, can be NULL.
+///
+void __kmpc_target_deinit(IdentTy *Ident, bool IsSPMD, bool) {
+  state::assumeInitialState(IsSPMD);
+  if (IsSPMD)
+    return;
+
+  // Signal the workers to exit the state machine and exit the kernel.
+  state::ParallelRegionFn = nullptr;
+}
+
+int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
+}
+
+#pragma omp end declare target
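
To illustrate how these entry points are meant to be used: in generic mode the compiler brackets the target region so that worker threads enter the state machine inside __kmpc_target_init and only the main thread runs the sequential part. A rough sketch of the expected lowering (illustrative, the actual clang codegen differs in details):

    void kernel_body();  // user code outlined by the compiler

    void __omp_offloading_example_kernel(IdentTy *Ident) {
      int32_t TId = __kmpc_target_init(Ident, /*IsSPMD=*/false,
                                       /*UseGenericStateMachine=*/true, false);
      // Workers come back here only once the state machine has terminated.
      if (TId != -1)
        return;
      kernel_body();  // the main thread executes the sequential user code
      __kmpc_target_deinit(Ident, /*IsSPMD=*/false, false);
    }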

diff  --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
new file mode 100644
index 000000000000..fc3ca637ef4c
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -0,0 +1,221 @@
+//===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mapping.h"
+#include "State.h"
+#include "Types.h"
+#include "Utils.h"
+
+#pragma omp declare target
+
+using namespace _OMP;
+
+namespace _OMP {
+namespace impl {
+
+/// AMDGCN Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+uint32_t getGridDim(uint32_t n, uint16_t d) {
+  uint32_t q = n / d;
+  return q + (n > q * d);
+}
+
+uint32_t getWorkgroupDim(uint32_t group_id, uint32_t grid_size,
+                         uint16_t group_size) {
+  uint32_t r = grid_size - group_id * group_size;
+  return (r < group_size) ? r : group_size;
+}
+
+LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
+
+LaneMaskTy lanemaskLT() {
+  uint32_t Lane = mapping::getThreadIdInWarp();
+  int64_t Ballot = mapping::activemask();
+  uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
+  return Mask & Ballot;
+}
+
+LaneMaskTy lanemaskGT() {
+  uint32_t Lane = mapping::getThreadIdInWarp();
+  if (Lane == (mapping::getWarpSize() - 1))
+    return 0;
+  int64_t Ballot = mapping::activemask();
+  uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
+  return Mask & Ballot;
+}
+
+uint32_t getThreadIdInWarp() {
+  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+}
+
+uint32_t getThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); }
+
+uint32_t getBlockSize() {
+  // TODO: verify this logic for generic mode.
+  return getWorkgroupDim(__builtin_amdgcn_workgroup_id_x(),
+                         __builtin_amdgcn_grid_size_x(),
+                         __builtin_amdgcn_workgroup_size_x());
+}
+
+uint32_t getKernelSize() { return __builtin_amdgcn_grid_size_x(); }
+
+uint32_t getBlockId() { return __builtin_amdgcn_workgroup_id_x(); }
+
+uint32_t getNumberOfBlocks() {
+  return getGridDim(__builtin_amdgcn_grid_size_x(),
+                    __builtin_amdgcn_workgroup_size_x());
+}
+
+uint32_t getNumberOfProcessorElements() {
+  // TODO
+  return mapping::getBlockSize();
+}
+
+uint32_t getWarpId() {
+  return mapping::getThreadIdInBlock() / mapping::getWarpSize();
+}
+
+uint32_t getWarpSize() { return 64; }
+
+uint32_t getNumberOfWarpsInBlock() {
+  return mapping::getBlockSize() / mapping::getWarpSize();
+}
+
+#pragma omp end declare variant
+///}
+
+/// NVPTX Implementation
+///
+///{
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+LaneMaskTy activemask() {
+  unsigned int Mask;
+  asm("activemask.b32 %0;" : "=r"(Mask));
+  return Mask;
+}
+
+LaneMaskTy lanemaskLT() {
+  __kmpc_impl_lanemask_t Res;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(Res));
+  return Res;
+}
+
+LaneMaskTy lanemaskGT() {
+  __kmpc_impl_lanemask_t Res;
+  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(Res));
+  return Res;
+}
+
+uint32_t getThreadIdInWarp() {
+  return mapping::getThreadIdInBlock() & (mapping::getWarpSize() - 1);
+}
+
+uint32_t getThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
+
+uint32_t getBlockSize() {
+  return __nvvm_read_ptx_sreg_ntid_x() -
+         (!mapping::isSPMDMode() * mapping::getWarpSize());
+}
+
+uint32_t getKernelSize() { return __nvvm_read_ptx_sreg_nctaid_x(); }
+
+uint32_t getBlockId() { return __nvvm_read_ptx_sreg_ctaid_x(); }
+
+uint32_t getNumberOfBlocks() { return __nvvm_read_ptx_sreg_nctaid_x(); }
+
+uint32_t getNumberOfProcessorElements() {
+  return __nvvm_read_ptx_sreg_ntid_x();
+}
+
+uint32_t getWarpId() {
+  return mapping::getThreadIdInBlock() / mapping::getWarpSize();
+}
+
+uint32_t getWarpSize() { return 32; }
+
+uint32_t getNumberOfWarpsInBlock() {
+  return (mapping::getBlockSize() + mapping::getWarpSize() - 1) /
+         mapping::getWarpSize();
+}
+
+#pragma omp end declare variant
+///}
+
+} // namespace impl
+} // namespace _OMP
+
+bool mapping::isMainThreadInGenericMode() {
+  if (mapping::isSPMDMode() || icv::Level)
+    return false;
+
+  // Check if this is the last warp in the block.
+  uint32_t MainTId = (mapping::getNumberOfProcessorElements() - 1) &
+                     ~(mapping::getWarpSize() - 1);
+  return mapping::getThreadIdInBlock() == MainTId;
+}
+
+bool mapping::isLeaderInWarp() {
+  __kmpc_impl_lanemask_t Active = mapping::activemask();
+  __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT();
+  return utils::popc(Active & LaneMaskLT) == 0;
+}
+
+LaneMaskTy mapping::activemask() { return impl::activemask(); }
+
+LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); }
+
+LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); }
+
+uint32_t mapping::getThreadIdInWarp() { return impl::getThreadIdInWarp(); }
+
+uint32_t mapping::getThreadIdInBlock() { return impl::getThreadIdInBlock(); }
+
+uint32_t mapping::getBlockSize() { return impl::getBlockSize(); }
+
+uint32_t mapping::getKernelSize() { return impl::getKernelSize(); }
+
+uint32_t mapping::getBlockId() { return impl::getBlockId(); }
+
+uint32_t mapping::getNumberOfBlocks() { return impl::getNumberOfBlocks(); }
+
+uint32_t mapping::getNumberOfProcessorElements() {
+  return impl::getNumberOfProcessorElements();
+}
+
+uint32_t mapping::getWarpId() { return impl::getWarpId(); }
+
+uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
+
+uint32_t mapping::getNumberOfWarpsInBlock() {
+  return impl::getNumberOfWarpsInBlock();
+}
+
+/// Execution mode
+///
+///{
+static int SHARED(IsSPMDMode);
+
+void mapping::init(bool IsSPMD) {
+  if (!mapping::getThreadIdInBlock())
+    IsSPMDMode = IsSPMD;
+}
+
+bool mapping::isSPMDMode() { return IsSPMDMode; }
+
+bool mapping::isGenericMode() { return !isSPMDMode(); }
+///}
+
+#pragma omp end declare target
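
The isLeaderInWarp test above picks the lowest-numbered active lane: a lane leads exactly when no active lane lies below it in the mask. A small host-side illustration of the same arithmetic (plain C++, not device code):

    #include <bit>      // std::popcount, C++20
    #include <cstdint>

    bool isLeader(uint64_t Active, uint32_t Lane) {
      uint64_t LaneMaskLT = ((uint64_t)1 << Lane) - 1;
      return std::popcount(Active & LaneMaskLT) == 0;
    }

    // With Active = 0b10110 (lanes 1, 2, 4 active), isLeader(Active, 1) is
    // true while isLeader(Active, 2) and isLeader(Active, 4) are false.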

diff  --git a/openmp/libomptarget/DeviceRTL/src/Misc.cpp b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
new file mode 100644
index 000000000000..44fb85b552af
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Misc.cpp
@@ -0,0 +1,73 @@
+//===--------- Misc.cpp - OpenMP device misc interfaces ----------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "Types.h"
+
+#pragma omp declare target
+
+namespace _OMP {
+namespace impl {
+
+/// AMDGCN Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+double getWTick() { return ((double)1E-9); }
+
+double getWTime() {
+  // The intrinsics for measuring time have undocumented frequency
+  // This will probably need to be found by measurement on a number of
+  // architectures. Until then, return 0, which is very inaccurate as a
+  // timer but resolves the undefined symbol at link time.
+  return 0;
+}
+
+#pragma omp end declare variant
+
+/// NVPTX Implementation
+///
+///{
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+double getWTick() {
+  // Timer precision is 1ns
+  return ((double)1E-9);
+}
+
+double getWTime() {
+  unsigned long long nsecs;
+  asm("mov.u64  %0, %%globaltimer;" : "=l"(nsecs));
+  return (double)nsecs * getWTick();
+}
+
+#pragma omp end declare variant
+
+} // namespace impl
+} // namespace _OMP
+
+/// Interfaces
+///
+///{
+
+extern "C" {
+int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; }
+
+int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; }
+
+double omp_get_wtick(void) { return _OMP::impl::getWTick(); }
+
+double omp_get_wtime(void) { return _OMP::impl::getWTime(); }
+}
+
+///}
+#pragma omp end declare target
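
Usage is the standard OpenMP timing API from device code; note that on amdgcn getWTime currently returns 0 as explained above, so timings are only meaningful on nvptx for now. A minimal sketch:

    #include <cstdio>
    #include <omp.h>

    int main() {
      double Elapsed = 0;
    #pragma omp target map(from : Elapsed)
      {
        double Start = omp_get_wtime();
        // ... device work ...
        Elapsed = omp_get_wtime() - Start;
      }
      std::printf("device time: %f s (tick: %g s)\n", Elapsed, omp_get_wtick());
      return 0;
    }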

diff  --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
new file mode 100644
index 000000000000..7b505545f92f
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -0,0 +1,198 @@
+//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Parallel implementation in the GPU. Here is the pattern:
+//
+//    while (not finished) {
+//
+//    if (master) {
+//      sequential code, decide which par loop to do, or if finished
+//     __kmpc_kernel_prepare_parallel() // exec by master only
+//    }
+//    syncthreads // A
+//    __kmpc_kernel_parallel() // exec by all
+//    if (this thread is included in the parallel) {
+//      switch () for all parallel loops
+//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
+//    }
+//
+//
+//    The reason we don't exec end_parallel for the threads not included
+//    in the parallel loop is that for each barrier in the parallel
+//    region, these non-included threads will cycle through the
+//    syncthread A. Thus they must preserve their current threadId, which
+//    is larger than the number of threads in the team.
+//
+//    To make a long story short...
+//
+//===----------------------------------------------------------------------===//
+
+#include "Debug.h"
+#include "Interface.h"
+#include "Mapping.h"
+#include "State.h"
+#include "Synchronization.h"
+#include "Types.h"
+#include "Utils.h"
+
+using namespace _OMP;
+
+#pragma omp declare target
+
+namespace {
+
+uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
+  uint32_t NThreadsICV =
+      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
+  uint32_t NumThreads = mapping::getBlockSize();
+
+  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
+    NumThreads = NThreadsICV;
+
+  // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP.
+  if (NumThreads < mapping::getWarpSize())
+    NumThreads = 1;
+  else
+    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
+
+  return NumThreads;
+}
+
+// Invoke an outlined parallel function unwrapping arguments (up to 32).
+void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
+                     void **args, int64_t nargs) {
+  switch (nargs) {
+#include "generated_microtask_cases.gen"
+  default:
+    PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
+    __builtin_trap();
+  }
+}
+
+} // namespace
+
+extern "C" {
+
+void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
+                        int32_t num_threads, int proc_bind, void *fn,
+                        void *wrapper_fn, void **args, int64_t nargs) {
+
+  uint32_t TId = mapping::getThreadIdInBlock();
+  // Handle the serialized case first, same for SPMD/non-SPMD.
+  if (OMP_UNLIKELY(!if_expr || icv::Level)) {
+    __kmpc_serialized_parallel(ident, TId);
+    invokeMicrotask(TId, 0, fn, args, nargs);
+    __kmpc_end_serialized_parallel(ident, TId);
+    return;
+  }
+
+  uint32_t NumThreads = determineNumberOfThreads(num_threads);
+  if (mapping::isSPMDMode()) {
+    {
+      state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
+                                            1u, TId == 0);
+      state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0);
+      state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0);
+      synchronize::threads();
+
+      if (TId < NumThreads)
+        invokeMicrotask(TId, 0, fn, args, nargs);
+    }
+    synchronize::threads();
+    return;
+  }
+
+  // We do *not* create a new data environment because all threads in the team
+  // that are active are now running this parallel region. They share the
+  // TeamState, which has an increased level-var and potentially the
+  // active-level set, but they do not have individual ThreadStates yet. If
+  // they ever modify the ICVs beyond this point a ThreadState will be
+  // allocated.
+
+  bool IsActiveParallelRegion = NumThreads > 1;
+  if (!IsActiveParallelRegion) {
+    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true);
+    invokeMicrotask(TId, 0, fn, args, nargs);
+    return;
+  }
+
+  void **GlobalArgs = nullptr;
+  if (nargs) {
+    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
+#pragma unroll
+    for (int I = 0; I < nargs; I++)
+      GlobalArgs[I] = args[I];
+  }
+
+  {
+    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
+                                          1u, true);
+    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
+                                          (void *)nullptr, true);
+    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true);
+    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true);
+
+    // Master signals work to activate workers.
+    synchronize::threads();
+    // Master waits for workers to signal.
+    synchronize::threads();
+  }
+
+  if (nargs)
+    memory::freeShared(GlobalArgs, nargs * sizeof(void *),
+                       "global args free shared");
+}
+
+__attribute__((noinline)) bool
+__kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
+  // Work function and arguments for L1 parallel region.
+  *WorkFn = state::ParallelRegionFn;
+
+  // If this is the termination signal from the master, quit early.
+  if (!*WorkFn)
+    return false;
+
+  // Set to true for workers participating in the parallel region.
+  uint32_t TId = mapping::getThreadIdInBlock();
+  bool ThreadIsActive = TId < state::ParallelTeamSize;
+  return ThreadIsActive;
+}
+
+__attribute__((noinline)) void __kmpc_kernel_end_parallel() {
+  // In case we have modified an ICV for this thread before a ThreadState was
+  // created, we drop it now so it does not contaminate the next parallel
+  // region.
+  ASSERT(!mapping::isSPMDMode());
+  uint32_t TId = mapping::getThreadIdInBlock();
+  state::resetStateForThread(TId);
+  ASSERT(!mapping::isSPMDMode());
+}
+
+void __kmpc_serialized_parallel(IdentTy *, uint32_t TId) {
+  state::enterDataEnvironment();
+  ++icv::Level;
+}
+
+void __kmpc_end_serialized_parallel(IdentTy *, uint32_t TId) {
+  state::exitDataEnvironment();
+  --icv::Level;
+}
+
+uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }
+
+int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }
+
+void __kmpc_push_num_threads(IdentTy *, int32_t, int32_t NumThreads) {
+  icv::NThreads = NumThreads;
+}
+
+void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
+                           int32_t thread_limit) {}
+
+void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
+}
+
+#pragma omp end declare target
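
As a concrete example of the thread-count clamping in determineNumberOfThreads: the requested value is capped by the block size and then rounded down to a warp multiple (or to 1 below a full warp). A host-side mirror with a few worked values, assuming a warp size of 32:

    #include <cstdint>

    uint32_t clampToWarpMultiple(uint32_t Requested, uint32_t BlockSize,
                                 uint32_t WarpSize = 32) {
      uint32_t NumThreads = BlockSize;
      if (Requested != 0 && Requested < NumThreads)
        NumThreads = Requested;
      return NumThreads < WarpSize ? 1 : (NumThreads & ~(WarpSize - 1));
    }

    // clampToWarpMultiple(50, 96) == 32  (rounded down to a warp multiple)
    // clampToWarpMultiple(20, 96) == 1   (less than one warp -> serialized)
    // clampToWarpMultiple(0, 96)  == 96  (no limit requested -> full block)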

diff  --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
new file mode 100644
index 000000000000..cd5658161a4d
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -0,0 +1,318 @@
+//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of reduction with KMPC interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Debug.h"
+#include "Interface.h"
+#include "Mapping.h"
+#include "State.h"
+#include "Synchronization.h"
+#include "Types.h"
+#include "Utils.h"
+
+using namespace _OMP;
+
+namespace {
+
+#pragma omp declare target
+
+void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
+  for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
+    shflFct(reduce_data, /*LaneId - not used= */ 0,
+            /*Offset = */ mask, /*AlgoVersion=*/0);
+  }
+}
+
+void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
+                               uint32_t size, uint32_t tid) {
+  uint32_t curr_size;
+  uint32_t mask;
+  curr_size = size;
+  mask = curr_size / 2;
+  while (mask > 0) {
+    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
+    curr_size = (curr_size + 1) / 2;
+    mask = curr_size / 2;
+  }
+}
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
+                                          ShuffleReductFnTy shflFct) {
+  uint32_t size, remote_id, physical_lane_id;
+  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
+  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
+  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
+  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
+  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
+  do {
+    Liveness = mapping::activemask();
+    remote_id = utils::ffs(Liveness & lanemask_gt);
+    size = utils::popc(Liveness);
+    logical_lane_id /= 2;
+    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
+            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
+  } while (logical_lane_id % 2 == 0 && size > 1);
+  return (logical_lane_id == 0);
+}
+#endif
+
+static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
+                                            uint64_t reduce_size,
+                                            void *reduce_data,
+                                            ShuffleReductFnTy shflFct,
+                                            InterWarpCopyFnTy cpyFct,
+                                            bool isSPMDExecutionMode, bool) {
+  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
+  if (mapping::isMainThreadInGenericMode())
+    BlockThreadId = 0;
+  uint32_t NumThreads = omp_get_num_threads();
+  if (NumThreads == 1)
+    return 1;
+    /*
+     * This reduce function handles reduction within a team. It handles
+     * parallel regions in both L1 and L2 parallelism levels. It also
+     * supports Generic, SPMD, and NoOMP modes.
+     *
+     * 1. Reduce within a warp.
+     * 2. Warp master copies value to warp 0 via shared memory.
+     * 3. Warp 0 reduces to a single value.
+     * 4. The reduced value is available in the thread that returns 1.
+     */
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  uint32_t WarpsNeeded =
+      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+  uint32_t WarpId = mapping::getWarpId();
+
+  // Volta execution model:
+  // For the Generic execution mode a parallel region either has 1 thread and
+  // beyond that, always a multiple of 32. For the SPMD execution mode we may
+  // have any number of threads.
+  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
+    gpu_irregular_warp_reduce(reduce_data, shflFct,
+                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
+                              /*LaneId=*/mapping::getThreadIdInBlock() %
+                                  mapping::getWarpSize());
+
+  // When we have more than [mapping::getWarpSize()] number of threads
+  // a block reduction is performed here.
+  //
+  // Only L1 parallel region can enter this if condition.
+  if (NumThreads > mapping::getWarpSize()) {
+    // Gather all the reduced values from each warp
+    // to the first warp.
+    cpyFct(reduce_data, WarpsNeeded);
+
+    if (WarpId == 0)
+      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                BlockThreadId);
+  }
+  return BlockThreadId == 0;
+#else
+  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
+  if (Liveness == lanes::All) // Full warp
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
+    gpu_irregular_warp_reduce(reduce_data, shflFct,
+                              /*LaneCount=*/utils::popc(Liveness),
+                              /*LaneId=*/mapping::getThreadIdInBlock() %
+                                  mapping::getWarpSize());
+  else { // Dispersed lanes. Only threads in L2
+         // parallel region may enter here; return
+         // early.
+    return gpu_irregular_simd_reduce(reduce_data, shflFct);
+  }
+
+  // When we have more than [mapping::getWarpSize()] number of threads
+  // a block reduction is performed here.
+  //
+  // Only L1 parallel region can enter this if condition.
+  if (NumThreads > mapping::getWarpSize()) {
+    uint32_t WarpsNeeded =
+        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+    // Gather all the reduced values from each warp
+    // to the first warp.
+    cpyFct(reduce_data, WarpsNeeded);
+
+    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
+    if (WarpId == 0)
+      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                BlockThreadId);
+
+    return BlockThreadId == 0;
+  }
+
+  // Get the OMP thread Id. This is different from BlockThreadId in the case
+  // of an L2 parallel region.
+  return TId == 0;
+#endif // __CUDA_ARCH__ >= 700
+}
+
+uint32_t roundToWarpsize(uint32_t s) {
+  if (s < mapping::getWarpSize())
+    return 1;
+  return (s & ~(unsigned)(mapping::getWarpSize() - 1));
+}
+
+uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
+
+static volatile uint32_t IterCnt = 0;
+static volatile uint32_t Cnt = 0;
+
+} // namespace
+
+extern "C" {
+int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
+    IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
+    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
+  return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
+                                      shflFct, cpyFct, mapping::isSPMDMode(),
+                                      false);
+}
+
+int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
+    IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
+    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
+    ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
+    ListGlobalFnTy glredFct) {
+
+  // Terminate all threads in non-SPMD mode except for the master thread.
+  uint32_t ThreadId = mapping::getThreadIdInBlock();
+  if (mapping::isGenericMode()) {
+    if (!mapping::isMainThreadInGenericMode())
+      return 0;
+    ThreadId = 0;
+  }
+
+  // In non-generic mode all workers participate in the teams reduction.
+  // In generic mode only the team master participates in the teams
+  // reduction because the workers are waiting for parallel work.
+  uint32_t NumThreads = omp_get_num_threads();
+  uint32_t TeamId = omp_get_team_num();
+  uint32_t NumTeams = omp_get_num_teams();
+  static unsigned SHARED(Bound);
+  static unsigned SHARED(ChunkTeamCount);
+
+  // Block progress for teams greater than the current upper
+  // limit. We only ever allow a number of teams less than or equal to the
+  // number of slots in the buffer.
+  bool IsMaster = (ThreadId == 0);
+  while (IsMaster) {
+    Bound = atomic::read((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST);
+    if (TeamId < Bound + num_of_records)
+      break;
+  }
+
+  if (IsMaster) {
+    int ModBlockId = TeamId % num_of_records;
+    if (TeamId < num_of_records) {
+      lgcpyFct(GlobalBuffer, ModBlockId, reduce_data);
+    } else
+      lgredFct(GlobalBuffer, ModBlockId, reduce_data);
+
+    fence::system(__ATOMIC_SEQ_CST);
+
+    // Increment team counter.
+    // This counter is incremented by all teams in the current
+    // BUFFER_SIZE chunk.
+    ChunkTeamCount =
+        atomic::inc((uint32_t *)&Cnt, num_of_records - 1u, __ATOMIC_SEQ_CST);
+  }
+  // Synchronize
+  if (mapping::isSPMDMode())
+    __kmpc_barrier(Loc, TId);
+
+  // reduce_data is global or shared so before being reduced within the
+  // warp we need to bring it in local memory:
+  // local_reduce_data = reduce_data[i]
+  //
+  // Example for 3 reduction variables a, b, c (of potentially different
+  // types):
+  //
+  // buffer layout (struct of arrays):
+  // a, a, ..., a, b, b, ... b, c, c, ... c
+  // |__________|
+  //     num_of_records
+  //
+  // local_data_reduce layout (struct):
+  // a, b, c
+  //
+  // Each thread will have a local struct containing the values to be
+  // reduced:
+  //      1. do reduction within each warp.
+  //      2. do reduction across warps.
+  //      3. write the final result to the main reduction variable
+  //         by returning 1 in the thread holding the reduction result.
+
+  // Check if this is the very last team.
+  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
+  if (ChunkTeamCount == NumTeams - Bound - 1) {
+    //
+    // Last team processing.
+    //
+    if (ThreadId >= NumRecs)
+      return 0;
+    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
+    if (ThreadId >= NumThreads)
+      return 0;
+
+    // Load from buffer and reduce.
+    glcpyFct(GlobalBuffer, ThreadId, reduce_data);
+    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
+      glredFct(GlobalBuffer, i, reduce_data);
+
+    // Reduce across warps to the warp master.
+    if (NumThreads > 1) {
+      gpu_regular_warp_reduce(reduce_data, shflFct);
+
+      // When we have more than [mapping::getWarpSize()] number of threads
+      // a block reduction is performed here.
+      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
+      if (ActiveThreads > mapping::getWarpSize()) {
+        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
+                               mapping::getWarpSize();
+        // Gather all the reduced values from each warp
+        // to the first warp.
+        cpyFct(reduce_data, WarpsNeeded);
+
+        uint32_t WarpId = ThreadId / mapping::getWarpSize();
+        if (WarpId == 0)
+          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                    ThreadId);
+      }
+    }
+
+    if (IsMaster) {
+      Cnt = 0;
+      IterCnt = 0;
+      return 1;
+    }
+    return 0;
+  }
+  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
+    // Allow SIZE number of teams to proceed writing their
+    // intermediate results to the global buffer.
+    atomic::add((uint32_t *)&IterCnt, uint32_t(num_of_records),
+                __ATOMIC_SEQ_CST);
+  }
+
+  return 0;
+}
+
+void __kmpc_nvptx_end_reduce(int32_t TId) {}
+
+void __kmpc_nvptx_end_reduce_nowait(int32_t TId) {}
+}
+
+#pragma omp end declare target
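
gpu_regular_warp_reduce is a classic halving tree: with a warp size of 32 the compiler-generated shuffle function is invoked with offsets 16, 8, 4, 2, 1, so after five steps lane 0 holds the combined value. A host-side mirror of the same pattern on a plain array (for illustration only; Combine stands in for the generated shuffle-reduce function):

    #include <cstdint>

    void treeReduce(int32_t Vals[32], int32_t (*Combine)(int32_t, int32_t)) {
      // Same offset sequence as the warp reduction: 16, 8, 4, 2, 1.
      for (uint32_t Offset = 16; Offset > 0; Offset /= 2)
        for (uint32_t Lane = 0; Lane < Offset; ++Lane)
          Vals[Lane] = Combine(Vals[Lane], Vals[Lane + Offset]);
    }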

diff  --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
new file mode 100644
index 000000000000..fb472c9bbe4d
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -0,0 +1,519 @@
+//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "State.h"
+#include "Configuration.h"
+#include "Debug.h"
+#include "Interface.h"
+#include "Mapping.h"
+#include "Synchronization.h"
+#include "Types.h"
+#include "Utils.h"
+
+using namespace _OMP;
+
+#pragma omp declare target
+
+/// Memory implementation
+///
+///{
+
+namespace {
+
+/// Fallback implementations are missing to trigger a link time error.
+/// Implementations for new devices, including the host, should go into a
+/// dedicated begin/end declare variant.
+///
+///{
+
+extern "C" {
+void *malloc(uint64_t Size);
+void free(void *Ptr);
+}
+
+///}
+
+/// AMDGCN implementations of malloc and free.
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+extern "C" {
+void *malloc(uint64_t Size) {
+  // TODO: Use some preallocated space for dynamic malloc.
+  return nullptr;
+}
+
+void free(void *Ptr) {}
+}
+
+#pragma omp end declare variant
+///}
+
+/// Add worst-case padding so that future allocations are properly aligned.
+constexpr const uint32_t Alignment = 8;
+
+/// A "smart" stack in shared memory.
+///
+/// The stack exposes a malloc/free interface but works like a stack internally.
+/// In fact, it is a separate stack *per warp*. That means, each warp must push
+/// and pop symmetrically or this breaks, badly. The implementation will (aim
+/// to) detect non-lock-step warps and fallback to malloc/free. The same will
+/// happen if a warp runs out of memory. The master warp in generic mode is
+/// special and is given more memory than the rest.
+///
+struct SharedMemorySmartStackTy {
+  /// Initialize the stack. Must be called by all threads.
+  void init(bool IsSPMD);
+
+  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
+  /// can call this function.
+  void *push(uint64_t Bytes);
+
+  /// Deallocate the last allocation made by the encountering thread and pointed
+  /// to by \p Ptr from the stack. Each thread can call this function.
+  void pop(void *Ptr, uint32_t Bytes);
+
+private:
+  /// Compute the size of the storage space reserved for a thread.
+  uint32_t computeThreadStorageTotal() {
+    uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements();
+    return (state::SharedScratchpadSize - NumLanesInBlock + 1) /
+           NumLanesInBlock;
+  }
+
+  /// Return the top address of the thread's data stack, that is the first
+  /// address this thread will allocate memory at next.
+  void *getThreadDataTop(uint32_t TId) {
+    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
+  }
+
+  /// The actual storage, shared among all warps.
+  unsigned char Data[state::SharedScratchpadSize]
+      __attribute__((aligned(Alignment)));
+  unsigned char Usage[mapping::MaxThreadsPerTeam]
+      __attribute__((aligned(Alignment)));
+};
+
+static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
+              "Shared scratchpad of this size not supported yet.");
+
+/// The allocation of a single shared memory scratchpad.
+static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);
+
+void SharedMemorySmartStackTy::init(bool IsSPMD) {
+  Usage[mapping::getThreadIdInBlock()] = 0;
+}
+
+void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
+  // First align the number of requested bytes.
+  uint64_t AlignedBytes = (Bytes + (Alignment - 1)) / Alignment * Alignment;
+
+  uint32_t StorageTotal = computeThreadStorageTotal();
+
+  // The main thread in generic mode gets the space of its entire warp as the
+  // other threads do not participate in any computation at all.
+  if (mapping::isMainThreadInGenericMode())
+    StorageTotal *= mapping::getWarpSize();
+
+  int TId = mapping::getThreadIdInBlock();
+  if (Usage[TId] + AlignedBytes <= StorageTotal) {
+    void *Ptr = getThreadDataTop(TId);
+    Usage[TId] += AlignedBytes;
+    return Ptr;
+  }
+
+  return memory::allocGlobal(AlignedBytes,
+                             "Slow path shared memory allocation, insufficient "
+                             "shared memory stack memory!");
+}
+
+void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
+  uint64_t AlignedBytes = (Bytes + (Alignment - 1)) / Alignment * Alignment;
+  if (Ptr >= &Data[0] && Ptr < &Data[state::SharedScratchpadSize]) {
+    int TId = mapping::getThreadIdInBlock();
+    Usage[TId] -= AlignedBytes;
+    return;
+  }
+  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
+}
+
+} // namespace
+
+void *memory::allocShared(uint64_t Bytes, const char *Reason) {
+  return SharedMemorySmartStack.push(Bytes);
+}
+
+void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
+  SharedMemorySmartStack.pop(Ptr, Bytes);
+}
+
+void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
+  return malloc(Bytes);
+}
+
+void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }
+
+///}
+
+namespace {
+
+struct ICVStateTy {
+  uint32_t NThreadsVar;
+  uint32_t LevelVar;
+  uint32_t ActiveLevelVar;
+  uint32_t MaxActiveLevelsVar;
+  uint32_t RunSchedVar;
+  uint32_t RunSchedChunkVar;
+
+  bool operator==(const ICVStateTy &Other) const;
+
+  void assertEqual(const ICVStateTy &Other) const;
+};
+
+bool ICVStateTy::operator==(const ICVStateTy &Other) const {
+  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
+         (ActiveLevelVar == Other.ActiveLevelVar) &
+         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
+         (RunSchedVar == Other.RunSchedVar) &
+         (RunSchedChunkVar == Other.RunSchedChunkVar);
+}
+
+void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
+  ASSERT(NThreadsVar == Other.NThreadsVar);
+  ASSERT(LevelVar == Other.LevelVar);
+  ASSERT(ActiveLevelVar == Other.ActiveLevelVar);
+  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar);
+  ASSERT(RunSchedVar == Other.RunSchedVar);
+  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar);
+}
+
+struct TeamStateTy {
+  /// TODO: provide a proper init function.
+  void init(bool IsSPMD);
+
+  bool operator==(const TeamStateTy &) const;
+
+  void assertEqual(TeamStateTy &Other) const;
+
+  /// ICVs
+  ///
+  /// Preallocated storage for ICV values that are used if the threads have not
+  /// set a custom default. The latter is supported but unlikely and slow(er).
+  ///
+  ///{
+  ICVStateTy ICVState;
+  ///}
+
+  uint32_t ParallelTeamSize;
+  ParallelRegionFnTy ParallelRegionFnVar;
+};
+
+TeamStateTy SHARED(TeamState);
+
+void TeamStateTy::init(bool IsSPMD) {
+  ICVState.NThreadsVar = mapping::getBlockSize();
+  ICVState.LevelVar = 0;
+  ICVState.ActiveLevelVar = 0;
+  ICVState.MaxActiveLevelsVar = 1;
+  ICVState.RunSchedVar = omp_sched_static;
+  ICVState.RunSchedChunkVar = 1;
+  ParallelTeamSize = 1;
+  ParallelRegionFnVar = nullptr;
+}
+
+bool TeamStateTy::operator==(const TeamStateTy &Other) const {
+  return (ICVState == Other.ICVState) &
+         (ParallelTeamSize == Other.ParallelTeamSize);
+}
+
+void TeamStateTy::assertEqual(TeamStateTy &Other) const {
+  ICVState.assertEqual(Other.ICVState);
+  ASSERT(ParallelTeamSize == Other.ParallelTeamSize);
+}
+
+struct ThreadStateTy {
+
+  /// ICVs have preallocated storage in the TeamStateTy which is used if a
+  /// thread has not set a custom value. The latter is supported but unlikely.
+  /// When it happens we will allocate dynamic memory to hold the values of all
+  /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
+  /// ICV struct to hold them all. This is slower than alternatives but allows
+  /// users to pay only for what they use.
+  ///
+  ICVStateTy ICVState;
+
+  ThreadStateTy *PreviousThreadState;
+
+  void init() {
+    ICVState = TeamState.ICVState;
+    PreviousThreadState = nullptr;
+  }
+
+  void init(ThreadStateTy &PreviousTS) {
+    ICVState = PreviousTS.ICVState;
+    PreviousThreadState = &PreviousTS;
+  }
+};
+
+__attribute__((loader_uninitialized))
+ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
+#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
+
+uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var) {
+  if (OMP_LIKELY(TeamState.ICVState.LevelVar == 0))
+    return TeamState.ICVState.*Var;
+  uint32_t TId = mapping::getThreadIdInBlock();
+  if (!ThreadStates[TId]) {
+    ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
+        sizeof(ThreadStateTy), "ICV modification outside data environment"));
+    ThreadStates[TId]->init();
+  }
+  return ThreadStates[TId]->ICVState.*Var;
+}
+
+uint32_t &lookup32Impl(uint32_t ICVStateTy::*Var) {
+  uint32_t TId = mapping::getThreadIdInBlock();
+  if (OMP_UNLIKELY(ThreadStates[TId]))
+    return ThreadStates[TId]->ICVState.*Var;
+  return TeamState.ICVState.*Var;
+}
+uint64_t &lookup64Impl(uint64_t ICVStateTy::*Var) {
+  uint64_t TId = mapping::getThreadIdInBlock();
+  if (OMP_UNLIKELY(ThreadStates[TId]))
+    return ThreadStates[TId]->ICVState.*Var;
+  return TeamState.ICVState.*Var;
+}
+
+int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
+                             int OutOfBoundsVal = -1) {
+  if (Level == 0)
+    return DefaultVal;
+  int LevelVar = omp_get_level();
+  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
+    return OutOfBoundsVal;
+  int ActiveLevel = icv::ActiveLevel;
+  if (OMP_UNLIKELY(Level != ActiveLevel))
+    return DefaultVal;
+  return Val;
+}
+
+} // namespace
+
+uint32_t &state::lookup32(ValueKind Kind, bool IsReadonly) {
+  switch (Kind) {
+  case state::VK_NThreads:
+    if (IsReadonly)
+      return lookup32Impl(&ICVStateTy::NThreadsVar);
+    return lookupForModify32Impl(&ICVStateTy::NThreadsVar);
+  case state::VK_Level:
+    if (IsReadonly)
+      return lookup32Impl(&ICVStateTy::LevelVar);
+    return lookupForModify32Impl(&ICVStateTy::LevelVar);
+  case state::VK_ActiveLevel:
+    if (IsReadonly)
+      return lookup32Impl(&ICVStateTy::ActiveLevelVar);
+    return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar);
+  case state::VK_MaxActiveLevels:
+    if (IsReadonly)
+      return lookup32Impl(&ICVStateTy::MaxActiveLevelsVar);
+    return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar);
+  case state::VK_RunSched:
+    if (IsReadonly)
+      return lookup32Impl(&ICVStateTy::RunSchedVar);
+    return lookupForModify32Impl(&ICVStateTy::RunSchedVar);
+  case state::VK_RunSchedChunk:
+    if (IsReadonly)
+      return lookup32Impl(&ICVStateTy::RunSchedChunkVar);
+    return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar);
+  case state::VK_ParallelTeamSize:
+    return TeamState.ParallelTeamSize;
+  default:
+    break;
+  }
+  __builtin_unreachable();
+}
+
+void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) {
+  switch (Kind) {
+  case state::VK_ParallelRegionFn:
+    return TeamState.ParallelRegionFnVar;
+  default:
+    break;
+  }
+  __builtin_unreachable();
+}
+
+void state::init(bool IsSPMD) {
+  SharedMemorySmartStack.init(IsSPMD);
+  if (!mapping::getThreadIdInBlock())
+    TeamState.init(IsSPMD);
+
+  ThreadStates[mapping::getThreadIdInBlock()] = nullptr;
+}
+
+void state::enterDataEnvironment() {
+  unsigned TId = mapping::getThreadIdInBlock();
+  ThreadStateTy *NewThreadState =
+      static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
+  NewThreadState->init(*ThreadStates[TId]);
+  ThreadStates[TId] = NewThreadState;
+}
+
+void state::exitDataEnvironment() {
+  unsigned TId = mapping::getThreadIdInBlock();
+  resetStateForThread(TId);
+}
+
+void state::resetStateForThread(uint32_t TId) {
+  if (OMP_LIKELY(!ThreadStates[TId]))
+    return;
+
+  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
+  __kmpc_free_shared(ThreadStates[TId], sizeof(ThreadStateTy));
+  ThreadStates[TId] = PreviousThreadState;
+}
+
+void state::runAndCheckState(void(Func(void))) {
+  TeamStateTy OldTeamState = TeamState;
+  OldTeamState.assertEqual(TeamState);
+
+  Func();
+
+  OldTeamState.assertEqual(TeamState);
+}
+
+void state::assumeInitialState(bool IsSPMD) {
+  TeamStateTy InitialTeamState;
+  InitialTeamState.init(IsSPMD);
+  InitialTeamState.assertEqual(TeamState);
+  ASSERT(!ThreadStates[mapping::getThreadIdInBlock()]);
+  ASSERT(mapping::isSPMDMode() == IsSPMD);
+}
+
+extern "C" {
+void omp_set_dynamic(int V) {}
+
+int omp_get_dynamic(void) { return 0; }
+
+void omp_set_num_threads(int V) { icv::NThreads = V; }
+
+int omp_get_max_threads(void) { return icv::NThreads; }
+
+int omp_get_level(void) {
+  int LevelVar = icv::Level;
+  ASSERT(LevelVar >= 0);
+  return LevelVar;
+}
+
+int omp_get_active_level(void) { return !!icv::ActiveLevel; }
+
+int omp_in_parallel(void) { return !!icv::ActiveLevel; }
+
+void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
+  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
+  *ChunkSize = state::RunSchedChunk;
+}
+
+void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
+  icv::RunSched = (int)ScheduleKind;
+  state::RunSchedChunk = ChunkSize;
+}
+
+int omp_get_ancestor_thread_num(int Level) {
+  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
+}
+
+int omp_get_thread_num(void) {
+  return omp_get_ancestor_thread_num(omp_get_level());
+}
+
+int omp_get_team_size(int Level) {
+  return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1);
+}
+
+int omp_get_num_threads(void) { return state::ParallelTeamSize; }
+
+int omp_get_thread_limit(void) { return mapping::getKernelSize(); }
+
+int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }
+
+void omp_set_nested(int) {}
+
+int omp_get_nested(void) { return false; }
+
+void omp_set_max_active_levels(int Levels) {
+  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
+}
+
+int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }
+
+omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }
+
+int omp_get_num_places(void) { return 0; }
+
+int omp_get_place_num_procs(int) { return omp_get_num_procs(); }
+
+void omp_get_place_proc_ids(int, int *) {
+  // TODO
+}
+
+int omp_get_place_num(void) { return 0; }
+
+int omp_get_partition_num_places(void) { return 0; }
+
+void omp_get_partition_place_nums(int *) {
+  // TODO
+}
+
+int omp_get_cancellation(void) { return 0; }
+
+void omp_set_default_device(int) {}
+
+int omp_get_default_device(void) { return -1; }
+
+int omp_get_num_devices(void) { return config::getNumDevices(); }
+
+int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); }
+
+int omp_get_team_num() { return mapping::getBlockId(); }
+
+int omp_get_initial_device(void) { return -1; }
+}
+
+extern "C" {
+__attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) {
+  return memory::allocShared(Bytes, "Frontend alloc shared");
+}
+
+__attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
+  memory::freeShared(Ptr, Bytes, "Frontend free shared");
+}
+
+[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
+#pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
+    allocator(omp_pteam_mem_alloc)
+
+void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t NumArgs) {
+  SharedMemVariableSharingSpacePtr =
+      (void **)__kmpc_alloc_shared(sizeof(void *) * NumArgs);
+  *GlobalArgs = SharedMemVariableSharingSpacePtr;
+}
+
+void __kmpc_end_sharing_variables(void **GlobalArgsPtr, uint64_t NumArgs) {
+  __kmpc_free_shared(SharedMemVariableSharingSpacePtr,
+                     sizeof(void *) * NumArgs);
+}
+
+void __kmpc_get_shared_variables(void ***GlobalArgs) {
+  *GlobalArgs = SharedMemVariableSharingSpacePtr;
+}
+}
+#pragma omp end declare target
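
The shared-memory smart stack only works if every thread pushes and pops in strict LIFO order with matching sizes, and each request is rounded up to the 8-byte alignment (a 13-byte allocation consumes 16 bytes of the per-thread slice). A minimal sketch of how compiler-emitted code is expected to pair the calls (illustrative only):

    #include <cstdint>

    extern "C" void *__kmpc_alloc_shared(uint64_t Bytes);
    extern "C" void __kmpc_free_shared(void *Ptr, uint64_t Bytes);

    void shareCounter() {
      // Sizes must match exactly between the allocation and the free.
      int *Counter = static_cast<int *>(__kmpc_alloc_shared(sizeof(int)));
      *Counter = 0;
      // ... the parallel region reads/writes *Counter via sharing ...
      __kmpc_free_shared(Counter, sizeof(int));
    }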

diff  --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
new file mode 100644
index 000000000000..a055ad6d17ec
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -0,0 +1,314 @@
+//===- Synchronization.cpp - OpenMP Device synchronization API ---- c++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Include all synchronization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Synchronization.h"
+
+#include "Debug.h"
+#include "Interface.h"
+#include "Mapping.h"
+#include "State.h"
+#include "Types.h"
+#include "Utils.h"
+
+#pragma omp declare target
+
+using namespace _OMP;
+
+namespace impl {
+
+/// Atomics
+///
+///{
+/// NOTE: This function needs to be implemented by every target.
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);
+
+uint32_t atomicRead(uint32_t *Address, int Ordering) {
+  return __atomic_fetch_add(Address, 0U, __ATOMIC_SEQ_CST);
+}
+
+uint32_t atomicAdd(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __atomic_fetch_add(Address, Val, Ordering);
+}
+uint32_t atomicMax(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __atomic_fetch_max(Address, Val, Ordering);
+}
+
+uint32_t atomicExchange(uint32_t *Address, uint32_t Val, int Ordering) {
+  uint32_t R;
+  __atomic_exchange(Address, &Val, &R, Ordering);
+  return R;
+}
+uint32_t atomicCAS(uint32_t *Address, uint32_t Compare, uint32_t Val,
+                   int Ordering) {
+  (void)__atomic_compare_exchange(Address, &Compare, &Val, false, Ordering,
+                                  Ordering);
+  return Compare;
+}
+
+uint64_t atomicAdd(uint64_t *Address, uint64_t Val, int Ordering) {
+  return __atomic_fetch_add(Address, Val, Ordering);
+}
+///}
+
+/// AMDGCN Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __builtin_amdgcn_atomic_inc32(Address, Val, Ordering, "");
+}
+
+uint32_t SHARED(namedBarrierTracker);
+
+void namedBarrierInit() {
+  // Don't have global ctors, and shared memory is not zero init
+  atomic::store(&namedBarrierTracker, 0u, __ATOMIC_RELEASE);
+}
+
+void namedBarrier() {
+  uint32_t NumThreads = omp_get_num_threads();
+  // assert(NumThreads % 32 == 0);
+
+  uint32_t WarpSize = mapping::getWarpSize();
+  uint32_t NumWaves = NumThreads / WarpSize;
+
+  fence::team(__ATOMIC_ACQUIRE);
+
+  // named barrier implementation for amdgcn.
+  // Uses two 16 bit unsigned counters. One for the number of waves to have
+  // reached the barrier, and one to count how many times the barrier has been
+  // passed. These are packed in a single atomically accessed 32 bit integer.
+  // Low bits for the number of waves, assumed zero before this call.
+  // High bits to count the number of times the barrier has been passed.
+
+  // precondition: NumWaves != 0;
+  // invariant: NumWaves * WarpSize == NumThreads;
+  // precondition: NumWaves < 0xffffu;
+
+  // Increment the low 16 bits once, using the lowest active thread.
+  if (mapping::isLeaderInWarp()) {
+    uint32_t load = atomic::add(&namedBarrierTracker, 1,
+                                __ATOMIC_RELAXED); // commutative
+
+    // Record the number of times the barrier has been passed
+    uint32_t generation = load & 0xffff0000u;
+
+    if ((load & 0x0000ffffu) == (NumWaves - 1)) {
+      // Reached NumWaves in low bits so this is the last wave.
+      // Set low bits to zero and increment high bits
+      load += 0x00010000u; // wrap is safe
+      load &= 0xffff0000u; // because bits zeroed second
+
+      // Reset the wave counter and release the waiting waves
+      atomic::store(&namedBarrierTracker, load, __ATOMIC_RELAXED);
+    } else {
+      // more waves still to go, spin until generation counter changes
+      do {
+        __builtin_amdgcn_s_sleep(0);
+        load = atomic::load(&namedBarrierTracker, __ATOMIC_RELAXED);
+      } while ((load & 0xffff0000u) == generation);
+    }
+  }
+  fence::team(__ATOMIC_RELEASE);
+}
+
+void syncWarp(__kmpc_impl_lanemask_t) {
+  // AMDGCN doesn't need to sync threads in a warp
+}
+
+void syncThreads() { __builtin_amdgcn_s_barrier(); }
+
+void fenceTeam(int Ordering) { __builtin_amdgcn_fence(Ordering, "workgroup"); }
+
+void fenceKernel(int Ordering) { __builtin_amdgcn_fence(Ordering, "agent"); }
+
+void fenceSystem(int Ordering) { __builtin_amdgcn_fence(Ordering, ""); }
+
+#pragma omp end declare variant
+///}
+
+/// NVPTX Implementation
+///
+///{
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __nvvm_atom_inc_gen_ui(Address, Val);
+}
+
+void namedBarrierInit() {}
+
+void namedBarrier() {
+  uint32_t NumThreads = omp_get_num_threads();
+  ASSERT(NumThreads % 32 == 0);
+
+  // The named barrier for active parallel threads of a team in an L1 parallel
+  // region to synchronize with each other.
+  constexpr int BarrierNo = 7;
+  asm volatile("barrier.sync %0, %1;"
+               :
+               : "r"(BarrierNo), "r"(NumThreads)
+               : "memory");
+}
+
+void fenceTeam(int) { __nvvm_membar_cta(); }
+
+void fenceKernel(int) { __nvvm_membar_gl(); }
+
+void fenceSystem(int) { __nvvm_membar_sys(); }
+
+void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
+
+void syncThreads() {
+  constexpr int BarrierNo = 8;
+  asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory");
+}
+
+constexpr uint32_t OMP_SPIN = 1000;
+constexpr uint32_t UNSET = 0;
+constexpr uint32_t SET = 1;
+
+// TODO: This seems to hide a bug in the declare variant handling. If the
+//       function is called before it is defined here, the overload won't
+//       happen. Investigate later!
+void unsetLock(omp_lock_t *Lock) {
+  (void)atomicExchange((uint32_t *)Lock, UNSET, __ATOMIC_SEQ_CST);
+}
+
+int testLock(omp_lock_t *Lock) {
+  return atomicAdd((uint32_t *)Lock, 0u, __ATOMIC_SEQ_CST);
+}
+
+void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
+
+void destoryLock(omp_lock_t *Lock) { unsetLock(Lock); }
+
+void setLock(omp_lock_t *Lock) {
+  // TODO: Not sure spinning is a good idea here.
+  while (atomicCAS((uint32_t *)Lock, UNSET, SET, __ATOMIC_SEQ_CST) != UNSET) {
+    int32_t start = __nvvm_read_ptx_sreg_clock();
+    int32_t now;
+    for (;;) {
+      now = __nvvm_read_ptx_sreg_clock();
+      int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
+      if (cycles >= OMP_SPIN * mapping::getBlockId()) {
+        break;
+      }
+    }
+  } // wait for 0 to be the read value
+}
+
+#pragma omp end declare variant
+///}
+
+} // namespace impl
+
+void synchronize::init(bool IsSPMD) {
+  if (!IsSPMD)
+    impl::namedBarrierInit();
+}
+
+void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }
+
+void synchronize::threads() { impl::syncThreads(); }
+
+void fence::team(int Ordering) { impl::fenceTeam(Ordering); }
+
+void fence::kernel(int Ordering) { impl::fenceKernel(Ordering); }
+
+void fence::system(int Ordering) { impl::fenceSystem(Ordering); }
+
+uint32_t atomic::read(uint32_t *Addr, int Ordering) {
+  return impl::atomicRead(Addr, Ordering);
+}
+
+uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) {
+  return impl::atomicInc(Addr, V, Ordering);
+}
+
+uint32_t atomic::add(uint32_t *Addr, uint32_t V, int Ordering) {
+  return impl::atomicAdd(Addr, V, Ordering);
+}
+
+uint64_t atomic::add(uint64_t *Addr, uint64_t V, int Ordering) {
+  return impl::atomicAdd(Addr, V, Ordering);
+}
+
+extern "C" {
+void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}
+
+void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {}
+
+int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) {
+  __kmpc_barrier(Loc, TId);
+  return 0;
+}
+
+void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
+  if (mapping::isMainThreadInGenericMode())
+    return __kmpc_flush(Loc);
+
+  if (mapping::isSPMDMode())
+    return __kmpc_barrier_simple_spmd(Loc, TId);
+
+  impl::namedBarrier();
+}
+
+__attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc,
+                                                          int32_t TId) {
+  synchronize::threads();
+}
+
+int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {
+  return omp_get_team_num() == 0;
+}
+
+void __kmpc_end_master(IdentTy *Loc, int32_t TId) {}
+
+int32_t __kmpc_single(IdentTy *Loc, int32_t TId) {
+  return __kmpc_master(Loc, TId);
+}
+
+void __kmpc_end_single(IdentTy *Loc, int32_t TId) {
+  // The barrier is explicitly called.
+}
+
+void __kmpc_flush(IdentTy *Loc) { fence::kernel(__ATOMIC_SEQ_CST); }
+
+__kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask() {
+  return mapping::activemask();
+}
+
+void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) { synchronize::warp(Mask); }
+
+void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
+  omp_set_lock(reinterpret_cast<omp_lock_t *>(Name));
+}
+
+void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
+  omp_unset_lock(reinterpret_cast<omp_lock_t *>(Name));
+}
+
+void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); }
+
+void omp_destroy_lock(omp_lock_t *Lock) { impl::destoryLock(Lock); }
+
+void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); }
+
+void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
+
+int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); }
+} // extern "C"
+
+#pragma omp end declare target
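
For reference, the amdgcn namedBarrier above packs two 16-bit counters into one
32-bit word: the low half counts waves that have arrived, the high half counts
how often the barrier has been passed. A minimal, single-threaded host sketch of
that bookkeeping (illustrative only, not part of the patch; the device atomics
are elided) could look like this:

  #include <cstdint>

  static uint32_t Tracker = 0; // lives in shared memory on the device

  // Returns true for the wave that releases the barrier.
  bool arriveWave(uint32_t NumWaves) {
    uint32_t Load = Tracker++;                  // atomic::add on the device
    if ((Load & 0x0000ffffu) == NumWaves - 1) {
      Load += 0x00010000u;                      // bump the generation
      Load &= 0xffff0000u;                      // reset the wave count
      Tracker = Load;                           // atomic::store on the device
      return true;
    }
    return false; // caller spins until the high (generation) bits change
  }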

diff  --git a/openmp/libomptarget/DeviceRTL/src/Tasking.cpp b/openmp/libomptarget/DeviceRTL/src/Tasking.cpp
new file mode 100644
index 000000000000..6b6991e772f2
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Tasking.cpp
@@ -0,0 +1,104 @@
+//===-------- Tasking.cpp - NVPTX OpenMP tasks support ------------ C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Task implementation support.
+//
+// TODO: We should not allocate and execute the task in two steps. A new API is
+//       needed for that though.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Interface.h"
+#include "State.h"
+#include "Types.h"
+#include "Utils.h"
+
+using namespace _OMP;
+
+#pragma omp declare target
+
+TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, uint32_t, int32_t,
+                                        uint64_t TaskSizeInclPrivateValues,
+                                        uint64_t SharedValuesSize,
+                                        TaskFnTy TaskFn) {
+  auto TaskSizeInclPrivateValuesPadded =
+      utils::roundUp(TaskSizeInclPrivateValues, uint64_t(sizeof(void *)));
+  auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize;
+  TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal(
+      TaskSizeTotal, "explicit task descriptor");
+  TaskDescriptor->Payload =
+      utils::advance(TaskDescriptor, TaskSizeInclPrivateValuesPadded);
+  TaskDescriptor->TaskFn = TaskFn;
+
+  return TaskDescriptor;
+}
+
+int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId,
+                        TaskDescriptorTy *TaskDescriptor) {
+  return __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0);
+}
+
+int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId,
+                                  TaskDescriptorTy *TaskDescriptor, int32_t,
+                                  void *, int32_t, void *) {
+  state::DateEnvironmentRAII DERAII;
+
+  TaskDescriptor->TaskFn(0, TaskDescriptor);
+
+  memory::freeGlobal(TaskDescriptor, "explicit task descriptor");
+  return 0;
+}
+
+void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId,
+                               TaskDescriptorTy *TaskDescriptor) {
+  state::enterDataEnvironment();
+}
+
+void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId,
+                                  TaskDescriptorTy *TaskDescriptor) {
+  state::exitDataEnvironment();
+
+  memory::freeGlobal(TaskDescriptor, "explicit task descriptor");
+}
+
+void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t,
+                          void *) {}
+
+void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {}
+
+void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {}
+
+int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) { return 0; }
+
+int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) { return 0; }
+
+void __kmpc_taskloop(IdentTy *Loc, uint32_t TId,
+                     TaskDescriptorTy *TaskDescriptor, int,
+                     uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int,
+                     int32_t, uint64_t, void *) {
+  // Skip task entirely if empty iteration space.
+  if (*LowerBound > *UpperBound)
+    return;
+
+  // The compiler has already stored lb and ub in the TaskDescriptorTy
+  // structure. Since we use a single task to execute the entire loop, we can
+  // leave the initial task_t untouched.
+  __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0);
+}
+
+int omp_in_final(void) {
+  // Treat all tasks as final. The spec may expect the runtime to track more
+  // precisely whether a task was explicitly marked final by the user, but this
+  // is not explicitly specified; we act as if the runtime may promote any
+  // non-final task to a final one.
+  return 1;
+}
+
+int omp_get_max_task_priority(void) { return 0; }
+
+#pragma omp end declare target
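
The single allocation in __kmpc_omp_task_alloc above holds the task descriptor
plus its private values, padded to pointer alignment, and is followed by the
shared-values area that Payload points to. A small host-side sketch of that
size/offset computation (illustrative only; roundUpTo is an assumed helper
standing in for utils::roundUp):

  #include <cstdint>
  #include <cstdio>

  static uint64_t roundUpTo(uint64_t Value, uint64_t Align) {
    return (Value + Align - 1) / Align * Align;
  }

  int main() {
    uint64_t TaskSizeInclPrivateValues = 70; // descriptor + private values
    uint64_t SharedValuesSize = 16;          // shared-variable pointers
    uint64_t Padded = roundUpTo(TaskSizeInclPrivateValues, sizeof(void *));
    uint64_t Total = Padded + SharedValuesSize;
    // The Payload pointer is the allocation base advanced by Padded bytes.
    std::printf("allocate %llu bytes, Payload at offset %llu\n",
                (unsigned long long)Total, (unsigned long long)Padded);
    return 0;
  }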

diff  --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
new file mode 100644
index 000000000000..24c2d8adc5f7
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
@@ -0,0 +1,141 @@
+//===------- Utils.cpp - OpenMP device runtime utility functions -- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility and helper functions for the OpenMP device runtime.
+//===----------------------------------------------------------------------===//
+
+#include "Utils.h"
+
+#include "Interface.h"
+#include "Mapping.h"
+
+#pragma omp declare target
+
+using namespace _OMP;
+
+namespace _OMP {
+/// Helper to keep code alive without introducing a performance penalty.
+__attribute__((used, weak, optnone)) void keepAlive() {
+  __kmpc_barrier_simple_spmd(nullptr, 0);
+}
+} // namespace _OMP
+
+namespace impl {
+
+/// AMDGCN Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
+  *LowBits = (uint32_t)(Val & UINT64_C(0x00000000FFFFFFFF));
+  *HighBits = (uint32_t)((Val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
+}
+
+uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
+  return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
+}
+
+#pragma omp end declare variant
+///}
+
+/// NVPTX Implementation
+///
+///{
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
+  uint32_t LowBitsLocal, HighBitsLocal;
+  asm("mov.b64 {%0,%1}, %2;"
+      : "=r"(LowBitsLocal), "=r"(HighBitsLocal)
+      : "l"(Val));
+  *LowBits = LowBitsLocal;
+  *HighBits = HighBitsLocal;
+}
+
+uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
+  uint64_t Val;
+  asm("mov.b64 %0, {%1,%2};" : "=l"(Val) : "r"(LowBits), "r"(HighBits));
+  return Val;
+}
+
+#pragma omp end declare variant
+///}
+
+/// AMDGCN Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
+  int Width = mapping::getWarpSize();
+  int Self = mapping::getThreadIdInWarp();
+  int Index = SrcLane + (Self & ~(Width - 1));
+  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
+}
+
+int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
+                    int32_t Width) {
+  int Self = mapping::getThreadIdInWarp();
+  int Index = Self + LaneDelta;
+  Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
+  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
+}
+
+#pragma omp end declare variant
+///}
+
+/// NVPTX Implementation
+///
+///{
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
+  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
+}
+
+int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
+  int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f;
+  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
+}
+
+#pragma omp end declare variant
+} // namespace impl
+
+uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
+  return impl::Pack(LowBits, HighBits);
+}
+
+void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
+  impl::Unpack(Val, &LowBits, &HighBits);
+}
+
+int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
+  return impl::shuffle(Mask, Var, SrcLane);
+}
+
+int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
+                           int32_t Width) {
+  return impl::shuffleDown(Mask, Var, Delta, Width);
+}
+
+extern "C" {
+int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
+  return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
+}
+
+int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
+  uint32_t lo, hi;
+  utils::unpack(Val, lo, hi);
+  hi = impl::shuffleDown(lanes::All, hi, Delta, Width);
+  lo = impl::shuffleDown(lanes::All, lo, Delta, Width);
+  return utils::pack(lo, hi);
+}
+}
+
+#pragma omp end declare target
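
__kmpc_shuffle_int64 above moves a 64-bit value across lanes by splitting it
into two 32-bit halves, shuffling each half, and repacking the result. A
portable sketch of that pattern (illustrative only; shuffle32 is a placeholder
for the per-target warp shuffle):

  #include <cstdint>

  static void unpack64(uint64_t Val, uint32_t &Lo, uint32_t &Hi) {
    Lo = static_cast<uint32_t>(Val);
    Hi = static_cast<uint32_t>(Val >> 32);
  }

  static uint64_t pack64(uint32_t Lo, uint32_t Hi) {
    return (static_cast<uint64_t>(Hi) << 32) | Lo;
  }

  static int32_t shuffle32(int32_t Var) { return Var; } // placeholder

  int64_t shuffle64(int64_t Val) {
    uint32_t Lo, Hi;
    unpack64(static_cast<uint64_t>(Val), Lo, Hi);
    Hi = static_cast<uint32_t>(shuffle32(static_cast<int32_t>(Hi)));
    Lo = static_cast<uint32_t>(shuffle32(static_cast<int32_t>(Lo)));
    return static_cast<int64_t>(pack64(Lo, Hi));
  }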

diff  --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
new file mode 100644
index 000000000000..12dbeeed912a
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -0,0 +1,598 @@
+//===----- Workshare.cpp -  OpenMP workshare implementation ------ C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the KMPC interface
+// for the loop construct plus other worksharing constructs that use the same
+// interface as loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Debug.h"
+#include "Interface.h"
+#include "Mapping.h"
+#include "State.h"
+#include "Synchronization.h"
+#include "Types.h"
+#include "Utils.h"
+
+using namespace _OMP;
+
+// TODO:
+struct DynamicScheduleTracker {
+  int64_t Chunk;
+  int64_t LoopUpperBound;
+  int64_t NextLowerBound;
+  int64_t Stride;
+  kmp_sched_t ScheduleType;
+  DynamicScheduleTracker *NextDST;
+};
+
+#define ASSERT0(...)
+
+// used by the library for the interface with the app
+#define DISPATCH_FINISHED 0
+#define DISPATCH_NOTFINISHED 1
+
+// used by dynamic scheduling
+#define FINISHED 0
+#define NOT_FINISHED 1
+#define LAST_CHUNK 2
+
+#pragma omp declare target
+
+// TODO: This variable is a hack inherited from the old runtime.
+uint64_t SHARED(Cnt);
+
+template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
+  ////////////////////////////////////////////////////////////////////////////////
+  // Loop with static scheduling with chunk
+
+  // Generic implementation of OMP loop scheduling with static policy
+  /*! \brief Calculate initial bounds for static loop and stride
+   *  @param[in] loc location in code of the call (not used here)
+   *  @param[in] global_tid global thread id
+   *  @param[in] schedtype type of scheduling (see kmp_sched_t)
+   *  @param[in] plastiter pointer to last iteration
+   *  @param[in,out] plower pointer to loop lower bound; it will contain the
+   *  lower bound of the first chunk
+   *  @param[in,out] pupper pointer to loop upper bound; it will contain the
+   *  upper bound of the first chunk
+   *  @param[in,out] pstride pointer to loop stride; it will contain the stride
+   *  between two successive chunks executed by the same thread
+   *  @param[in] incr loop increment bump
+   *  @param[in] chunk chunk size
+   */
+
+  // helper function for static chunk
+  static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
+                             T entityId, T numberOfEntities) {
+    // each thread executes multiple chunks all of the same size, except
+    // the last one
+    // distance between two successive chunks
+    stride = numberOfEntities * chunk;
+    lb = lb + entityId * chunk;
+    T inputUb = ub;
+    ub = lb + chunk - 1; // Clang uses i <= ub
+    // Say ub' is the beginning of the last chunk. Then whoever has a
+    // lower bound plus a multiple of the increment equal to ub' is
+    // the last one.
+    T beginningLastChunk = inputUb - (inputUb % chunk);
+    last = ((beginningLastChunk - lb) % stride) == 0;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Loop with static scheduling without chunk
+
+  // helper function for static no chunk
+  static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
+                               T entityId, T numberOfEntities) {
+    // No chunk size specified.  Each thread or warp gets at most one
+    // chunk; chunks are all almost of equal size
+    T loopSize = ub - lb + 1;
+
+    chunk = loopSize / numberOfEntities;
+    T leftOver = loopSize - chunk * numberOfEntities;
+
+    if (entityId < leftOver) {
+      chunk++;
+      lb = lb + entityId * chunk;
+    } else {
+      lb = lb + entityId * chunk + leftOver;
+    }
+
+    T inputUb = ub;
+    ub = lb + chunk - 1; // Clang uses i <= ub
+    last = lb <= inputUb && inputUb <= ub;
+    stride = loopSize; // make sure we only do 1 chunk per warp
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Support for Static Init
+
+  static void for_static_init(int32_t gtid, int32_t schedtype,
+                              int32_t *plastiter, T *plower, T *pupper,
+                              ST *pstride, ST chunk, bool IsSPMDExecutionMode) {
+    // When IsRuntimeUninitialized is true, we assume that the caller is
+    // in an L0 parallel region and that all worker threads participate.
+
+    // Assume we are in teams region or that we use a single block
+    // per target region
+    int numberOfActiveOMPThreads = omp_get_num_threads();
+
+    // All warps that are in excess of the maximum requested, do
+    // not execute the loop
+    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
+            "current thread is not needed here; error");
+
+    // copy
+    int lastiter = 0;
+    T lb = *plower;
+    T ub = *pupper;
+    ST stride = *pstride;
+
+    // init
+    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
+    case kmp_sched_static_chunk: {
+      if (chunk > 0) {
+        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
+                       numberOfActiveOMPThreads);
+        break;
+      }
+    } // note: if chunk <=0, use nochunk
+    case kmp_sched_static_balanced_chunk: {
+      if (chunk > 0) {
+        // round up to make sure the chunk is enough to cover all iterations
+        T tripCount = ub - lb + 1; // +1 because ub is inclusive
+        T span = (tripCount + numberOfActiveOMPThreads - 1) /
+                 numberOfActiveOMPThreads;
+        // perform chunk adjustment
+        chunk = (span + chunk - 1) & ~(chunk - 1);
+
+        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
+        T oldUb = ub;
+        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
+                       numberOfActiveOMPThreads);
+        if (ub > oldUb)
+          ub = oldUb;
+        break;
+      }
+    } // note: if chunk <=0, use nochunk
+    case kmp_sched_static_nochunk: {
+      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
+                       numberOfActiveOMPThreads);
+      break;
+    }
+    case kmp_sched_distr_static_chunk: {
+      if (chunk > 0) {
+        ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
+                       omp_get_num_teams());
+        break;
+      } // note: if chunk <=0, use nochunk
+    }
+    case kmp_sched_distr_static_nochunk: {
+      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
+                       omp_get_num_teams());
+      break;
+    }
+    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
+      ForStaticChunk(lastiter, lb, ub, stride, chunk,
+                     numberOfActiveOMPThreads * omp_get_team_num() + gtid,
+                     omp_get_num_teams() * numberOfActiveOMPThreads);
+      break;
+    }
+    default: {
+      // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
+      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
+                     numberOfActiveOMPThreads);
+      break;
+    }
+    }
+    // copy back
+    *plastiter = lastiter;
+    *plower = lb;
+    *pupper = ub;
+    *pstride = stride;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Support for dispatch Init
+
+  static int OrderedSchedule(kmp_sched_t schedule) {
+    return schedule >= kmp_sched_ordered_first &&
+           schedule <= kmp_sched_ordered_last;
+  }
+
+  static void dispatch_init(IdentTy *loc, int32_t threadId,
+                            kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
+                            DynamicScheduleTracker *DST) {
+    int tid = mapping::getThreadIdInBlock();
+    T tnum = omp_get_num_threads();
+    T tripCount = ub - lb + 1; // +1 because ub is inclusive
+    ASSERT0(LT_FUSSY, threadId < tnum,
+            "current thread is not needed here; error");
+
+    /* Currently just ignore the monotonic and non-monotonic modifiers
+     * (the compiler isn't producing them yet anyway).
+     * When it is we'll want to look at them somewhere here and use that
+     * information to add to our schedule choice. We shouldn't need to pass
+     * them on, they merely affect which schedule we can legally choose for
+     * various dynamic cases. (In particular, whether or not a stealing scheme
+     * is legal).
+     */
+    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
+
+    // Process schedule.
+    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
+      if (OrderedSchedule(schedule))
+        __kmpc_barrier(loc, threadId);
+      schedule = kmp_sched_static_chunk;
+      chunk = tripCount; // one thread gets the whole loop
+    } else if (schedule == kmp_sched_runtime) {
+      // process runtime
+      omp_sched_t rtSched;
+      int ChunkInt;
+      omp_get_schedule(&rtSched, &ChunkInt);
+      chunk = ChunkInt;
+      switch (rtSched) {
+      case omp_sched_static: {
+        if (chunk > 0)
+          schedule = kmp_sched_static_chunk;
+        else
+          schedule = kmp_sched_static_nochunk;
+        break;
+      }
+      case omp_sched_auto: {
+        schedule = kmp_sched_static_chunk;
+        chunk = 1;
+        break;
+      }
+      case omp_sched_dynamic:
+      case omp_sched_guided: {
+        schedule = kmp_sched_dynamic;
+        break;
+      }
+      }
+    } else if (schedule == kmp_sched_auto) {
+      schedule = kmp_sched_static_chunk;
+      chunk = 1;
+    } else {
+      // ASSERT(LT_FUSSY,
+      //        schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
+      //        "unknown schedule %d & chunk %lld\n", (int)schedule,
+      //        (long long)chunk);
+    }
+
+    // init schedules
+    if (schedule == kmp_sched_static_chunk) {
+      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
+      // save sched state
+      DST->ScheduleType = schedule;
+      // save ub
+      DST->LoopUpperBound = ub;
+      // compute static chunk
+      ST stride;
+      int lastiter = 0;
+      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+      // save computed params
+      DST->Chunk = chunk;
+      DST->NextLowerBound = lb;
+      DST->Stride = stride;
+    } else if (schedule == kmp_sched_static_balanced_chunk) {
+      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
+      // save sched state
+      DST->ScheduleType = schedule;
+      // save ub
+      DST->LoopUpperBound = ub;
+      // compute static chunk
+      ST stride;
+      int lastiter = 0;
+      // round up to make sure the chunk is enough to cover all iterations
+      T span = (tripCount + tnum - 1) / tnum;
+      // perform chunk adjustment
+      chunk = (span + chunk - 1) & ~(chunk - 1);
+
+      T oldUb = ub;
+      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
+      if (ub > oldUb)
+        ub = oldUb;
+      // save computed params
+      DST->Chunk = chunk;
+      DST->NextLowerBound = lb;
+      DST->Stride = stride;
+    } else if (schedule == kmp_sched_static_nochunk) {
+      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
+      // save sched state
+      DST->ScheduleType = schedule;
+      // save ub
+      DST->LoopUpperBound = ub;
+      // compute static chunk
+      ST stride;
+      int lastiter = 0;
+      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+      // save computed params
+      DST->Chunk = chunk;
+      DST->NextLowerBound = lb;
+      DST->Stride = stride;
+    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
+      // save data
+      DST->ScheduleType = schedule;
+      if (chunk < 1)
+        chunk = 1;
+      DST->Chunk = chunk;
+      DST->LoopUpperBound = ub;
+      DST->NextLowerBound = lb;
+      __kmpc_barrier(loc, threadId);
+      if (tid == 0) {
+        Cnt = 0;
+        fence::team(__ATOMIC_SEQ_CST);
+      }
+      __kmpc_barrier(loc, threadId);
+    }
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Support for dispatch next
+
+  static uint64_t NextIter() {
+    __kmpc_impl_lanemask_t active = mapping::activemask();
+    uint32_t leader = utils::ffs(active) - 1;
+    uint32_t change = utils::popc(active);
+    __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
+    unsigned int rank = utils::popc(active & lane_mask_lt);
+    uint64_t warp_res;
+    if (rank == 0) {
+      warp_res = atomic::add(&Cnt, change, __ATOMIC_SEQ_CST);
+    }
+    warp_res = utils::shuffle(active, warp_res, leader);
+    return warp_res + rank;
+  }
+
+  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
+                              T loopUpperBound) {
+    T N = NextIter();
+    lb = loopLowerBound + N * chunkSize;
+    ub = lb + chunkSize - 1; // Clang uses i <= ub
+
+    // 3 result cases:
+    //  a. lb and ub < loopUpperBound --> NOT_FINISHED
+    //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
+    //  NOT_FINISHED
+    //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
+    // a.
+    if (lb <= loopUpperBound && ub < loopUpperBound) {
+      return NOT_FINISHED;
+    }
+    // b.
+    if (lb <= loopUpperBound) {
+      ub = loopUpperBound;
+      return LAST_CHUNK;
+    }
+    // c. if we are here, we are in case 'c'
+    lb = loopUpperBound + 2;
+    ub = loopUpperBound + 1;
+    return FINISHED;
+  }
+
+  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
+                           T *plower, T *pupper, ST *pstride,
+                           DynamicScheduleTracker *DST) {
+    // ID of a thread in its own warp
+
+    // automatically selects thread or warp ID based on selected implementation
+    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
+            "current thread is not needed here; error");
+    // retrieve schedule
+    kmp_sched_t schedule = DST->ScheduleType;
+
+    // xxx reduce to one
+    if (schedule == kmp_sched_static_chunk ||
+        schedule == kmp_sched_static_nochunk) {
+      T myLb = DST->NextLowerBound;
+      T ub = DST->LoopUpperBound;
+      // finished?
+      if (myLb > ub) {
+        return DISPATCH_FINISHED;
+      }
+      // not finished, save current bounds
+      ST chunk = DST->Chunk;
+      *plower = myLb;
+      T myUb = myLb + chunk - 1; // Clang uses i <= ub
+      if (myUb > ub)
+        myUb = ub;
+      *pupper = myUb;
+      *plast = (int32_t)(myUb == ub);
+
+      // increment next lower bound by the stride
+      ST stride = DST->Stride;
+      DST->NextLowerBound = myLb + stride;
+      return DISPATCH_NOTFINISHED;
+    }
+    ASSERT0(LT_FUSSY,
+            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
+            "bad sched");
+    T myLb, myUb;
+    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
+                                    DST->LoopUpperBound);
+
+    if (finished == FINISHED)
+      return DISPATCH_FINISHED;
+
+    // not finished (either not finished or last chunk)
+    *plast = (int32_t)(finished == LAST_CHUNK);
+    *plower = myLb;
+    *pupper = myUb;
+    *pstride = 1;
+
+    return DISPATCH_NOTFINISHED;
+  }
+
+  static void dispatch_fini() {
+    // nothing
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // end of template class that encapsulate all the helper functions
+  ////////////////////////////////////////////////////////////////////////////////
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (dyn loops)
+////////////////////////////////////////////////////////////////////////////////
+
+// TODO: This is a stopgap. We probably want to expand the dispatch API to take
+//       a DST pointer which can then be allocated properly without malloc.
+DynamicScheduleTracker *THREAD_LOCAL(ThreadDSTPtr);
+
+// Create a new DST, link the current one, and define the new as current.
+static DynamicScheduleTracker *pushDST() {
+  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
+      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
+  *NewDST = DynamicScheduleTracker({0});
+  NewDST->NextDST = ThreadDSTPtr;
+  ThreadDSTPtr = NewDST;
+  return ThreadDSTPtr;
+}
+
+// Return the current DST.
+static DynamicScheduleTracker *peekDST() { return ThreadDSTPtr; }
+
+// Pop the current DST and restore the last one.
+static void popDST() {
+  DynamicScheduleTracker *OldDST = ThreadDSTPtr->NextDST;
+  memory::freeGlobal(ThreadDSTPtr, "remove DST");
+  ThreadDSTPtr = OldDST;
+}
+
+extern "C" {
+
+// init
+void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
+                            int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
+                             uint32_t lb, uint32_t ub, int32_t st,
+                             int32_t chunk) {
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
+                            int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
+                             uint64_t lb, uint64_t ub, int64_t st,
+                             int64_t chunk) {
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+// next
+int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
+                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
+                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
+                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
+                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+// fini
+void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
+  popDST();
+}
+
+void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
+  popDST();
+}
+
+void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
+  popDST();
+}
+
+void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
+  popDST();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (static loops)
+////////////////////////////////////////////////////////////////////////////////
+
+void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
+                              int32_t schedtype, int32_t *plastiter,
+                              int32_t *plower, int32_t *pupper,
+                              int32_t *pstride, int32_t incr, int32_t chunk) {
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
+                               int32_t schedtype, int32_t *plastiter,
+                               uint32_t *plower, uint32_t *pupper,
+                               int32_t *pstride, int32_t incr, int32_t chunk) {
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
+                              int32_t schedtype, int32_t *plastiter,
+                              int64_t *plower, int64_t *pupper,
+                              int64_t *pstride, int64_t incr, int64_t chunk) {
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
+                               int32_t schedtype, int32_t *plastiter,
+                               uint64_t *plower, uint64_t *pupper,
+                               int64_t *pstride, int64_t incr, int64_t chunk) {
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}
+}
+
+#pragma omp end declare target
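
To make the static-chunk math above concrete, the following standalone host
program (illustrative only) restates ForStaticChunk for int64_t and prints the
first chunk each of four threads receives for ten iterations with a chunk size
of two; thread 0 ends up as the "last" thread because it also executes the
final chunk [8, 9] after advancing by the stride of 8:

  #include <cstdint>
  #include <cstdio>

  // Restatement of the static-chunk bound computation, for illustration.
  static void forStaticChunk(int &Last, int64_t &LB, int64_t &UB,
                             int64_t &Stride, int64_t Chunk, int64_t Id,
                             int64_t NumEntities) {
    Stride = NumEntities * Chunk;   // distance between two successive chunks
    LB = LB + Id * Chunk;
    int64_t InputUB = UB;
    UB = LB + Chunk - 1;            // Clang generates i <= ub loops
    int64_t BeginLastChunk = InputUB - (InputUB % Chunk);
    Last = ((BeginLastChunk - LB) % Stride) == 0;
  }

  int main() {
    for (int64_t Id = 0; Id < 4; ++Id) {
      int Last = 0;
      int64_t LB = 0, UB = 9, Stride = 0;
      forStaticChunk(Last, LB, UB, Stride, /*Chunk=*/2, Id, /*NumEntities=*/4);
      std::printf("thread %lld: [%lld, %lld], stride %lld, last %d\n",
                  (long long)Id, (long long)LB, (long long)UB,
                  (long long)Stride, Last);
    }
    return 0;
  }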


        


More information about the Openmp-commits mailing list