[Openmp-commits] [openmp] 034adaf - [OpenMP] Completely remove old device runtime
Joseph Huber via Openmp-commits
openmp-commits at lists.llvm.org
Fri Feb 4 12:31:44 PST 2022
Author: Joseph Huber
Date: 2022-02-04T15:31:33-05:00
New Revision: 034adaf5be4bfb5d2d61b20ea045ce97979c826a
URL: https://github.com/llvm/llvm-project/commit/034adaf5be4bfb5d2d61b20ea045ce97979c826a
DIFF: https://github.com/llvm/llvm-project/commit/034adaf5be4bfb5d2d61b20ea045ce97979c826a.diff
LOG: [OpenMP] Completely remove old device runtime
This patch completely removes the old OpenMP device runtime. Previously,
the new runtime used the prefix `libomptarget-new-` while the old runtime
was simply called `libomptarget-`. This patch makes the formerly new
runtime the only runtime available. The old runtime project has been
deleted entirely, and all references to the `libomptarget-new-` runtime
have been replaced with `libomptarget-`.
Reviewed By: JonChesterfield
Differential Revision: https://reviews.llvm.org/D118934
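As an illustration (not part of the patch), here is a minimal hypothetical
sketch of the naming scheme the drivers now follow for the device runtime
bitcode library; the `new-` component that previously distinguished the two
runtimes is gone, leaving only the offload target and the GPU architecture:

    #include <string>

    // Hypothetical helper (illustration only): builds the bitcode library
    // name the drivers search for after this change.
    std::string deviceRTLBitcodeName(const std::string &TargetName,
                                     const std::string &GPUArch) {
      // ("nvptx",  "sm_35")  -> "libomptarget-nvptx-sm_35.bc"
      // ("amdgcn", "gfx906") -> "libomptarget-amdgcn-gfx906.bc"
      return "libomptarget-" + TargetName + "-" + GPUArch + ".bc";
    }

Consequently, the `-f(no-)openmp-target-new-runtime` flags no longer select
between runtimes, as the driver and test changes below show.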
Added:
Modified:
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
clang/lib/Driver/ToolChains/Clang.cpp
clang/lib/Driver/ToolChains/Cuda.cpp
clang/lib/Frontend/CompilerInvocation.cpp
clang/test/Driver/amdgpu-openmp-toolchain.c
clang/test/Driver/openmp-offload-gpu.c
clang/test/OpenMP/target_globals_codegen.cpp
openmp/libomptarget/CMakeLists.txt
openmp/libomptarget/DeviceRTL/CMakeLists.txt
openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
openmp/libomptarget/plugins/cuda/CMakeLists.txt
openmp/libomptarget/test/api/omp_dynamic_shared_memory.c
openmp/libomptarget/test/lit.cfg
openmp/libomptarget/test/mapping/data_member_ref.cpp
openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp
openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp
openmp/libomptarget/test/mapping/lambda_by_value.cpp
openmp/libomptarget/test/mapping/lambda_mapping.cpp
openmp/libomptarget/test/mapping/ompx_hold/struct.c
openmp/libomptarget/test/offloading/bug49021.cpp
openmp/libomptarget/test/offloading/bug49334.cpp
openmp/libomptarget/test/offloading/bug51781.c
openmp/libomptarget/test/offloading/global_constructor.cpp
openmp/libomptarget/test/offloading/host_as_target.c
openmp/libomptarget/test/unified_shared_memory/api.c
openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c
openmp/libomptarget/test/unified_shared_memory/close_modifier.c
openmp/libomptarget/test/unified_shared_memory/shared_update.c
Removed:
openmp/libomptarget/deviceRTLs/CMakeLists.txt
openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h
openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip
openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip
openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
openmp/libomptarget/deviceRTLs/common/allocator.h
openmp/libomptarget/deviceRTLs/common/debug.h
openmp/libomptarget/deviceRTLs/common/generated_microtask_cases.gen
openmp/libomptarget/deviceRTLs/common/include/target.h
openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h
openmp/libomptarget/deviceRTLs/common/omptarget.h
openmp/libomptarget/deviceRTLs/common/omptargeti.h
openmp/libomptarget/deviceRTLs/common/src/cancel.cu
openmp/libomptarget/deviceRTLs/common/src/critical.cu
openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
openmp/libomptarget/deviceRTLs/common/src/libcall.cu
openmp/libomptarget/deviceRTLs/common/src/loop.cu
openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
openmp/libomptarget/deviceRTLs/common/src/parallel.cu
openmp/libomptarget/deviceRTLs/common/src/reduction.cu
openmp/libomptarget/deviceRTLs/common/src/shuffle.cpp
openmp/libomptarget/deviceRTLs/common/src/support.cu
openmp/libomptarget/deviceRTLs/common/src/sync.cu
openmp/libomptarget/deviceRTLs/common/src/task.cu
openmp/libomptarget/deviceRTLs/common/state-queue.h
openmp/libomptarget/deviceRTLs/common/state-queuei.h
openmp/libomptarget/deviceRTLs/common/support.h
openmp/libomptarget/deviceRTLs/interface.h
openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
openmp/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt
openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
openmp/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt
openmp/libomptarget/deviceRTLs/nvptx/test/api/get_max_threads.c
openmp/libomptarget/deviceRTLs/nvptx/test/api/ignored.c
openmp/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c
openmp/libomptarget/deviceRTLs/nvptx/test/api/thread_limit.c
openmp/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c
openmp/libomptarget/deviceRTLs/nvptx/test/lit.cfg
openmp/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in
openmp/libomptarget/deviceRTLs/nvptx/test/parallel/barrier.c
openmp/libomptarget/deviceRTLs/nvptx/test/parallel/flush.c
openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c
openmp/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c
openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c
openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp
openmp/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c
openmp/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c
openmp/libomptarget/deviceRTLs/target_interface.h
################################################################################
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 2d55113368517..5f2aea518acae 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1203,8 +1203,7 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
llvm_unreachable("OpenMP can only handle device code.");
llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
- if (CGM.getLangOpts().OpenMPTargetNewRuntime &&
- !CGM.getLangOpts().OMPHostIRFile.empty()) {
+ if (!CGM.getLangOpts().OMPHostIRFile.empty()) {
OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
"__omp_rtl_debug_kind");
OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
index d7cf41e4b6605..b68b026cd988d 100644
--- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
@@ -290,11 +290,7 @@ void AMDGPUOpenMPToolChain::addClangTargetOptions(
return;
std::string BitcodeSuffix;
- if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
- options::OPT_fno_openmp_target_new_runtime, true))
- BitcodeSuffix = "new-amdgpu-" + GPUArch;
- else
- BitcodeSuffix = "amdgcn-" + GPUArch;
+ BitcodeSuffix = "amdgcn-" + GPUArch;
addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix,
getTriple());
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 5b2984ea2496f..fb508d8b905b1 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5936,13 +5936,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
options::OPT_fno_openmp_cuda_mode, /*Default=*/false))
CmdArgs.push_back("-fopenmp-cuda-mode");
- // When in OpenMP offloading mode, enable or disable the new device
- // runtime.
- if (Args.hasFlag(options::OPT_fopenmp_target_new_runtime,
- options::OPT_fno_openmp_target_new_runtime,
- /*Default=*/true))
- CmdArgs.push_back("-fopenmp-target-new-runtime");
-
// When in OpenMP offloading mode, enable debugging on the device.
Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_target_debug_EQ);
if (Args.hasFlag(options::OPT_fopenmp_target_debug,
@@ -8187,9 +8180,6 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
StringRef Arch = TCArgs.getLastArgValue(options::OPT_march_EQ);
std::string BitcodeSuffix;
- if (TCArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
- options::OPT_fno_openmp_target_new_runtime, true))
- BitcodeSuffix += "new-";
if (TC->getTriple().isNVPTX())
BitcodeSuffix += "nvptx-";
else if (TC->getTriple().isAMDGPU())
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 4a9f6d4c4e3e4..14e7d26ba1c06 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -749,11 +749,7 @@ void CudaToolChain::addClangTargetOptions(
return;
std::string BitcodeSuffix;
- if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
- options::OPT_fno_openmp_target_new_runtime, true))
- BitcodeSuffix = "new-nvptx-" + GpuArch.str();
- else
- BitcodeSuffix = "nvptx-" + GpuArch.str();
+ BitcodeSuffix = "nvptx-" + GpuArch.str();
addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix,
getTriple());
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 7bf45bb281165..c42cae941634f 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -3484,9 +3484,6 @@ void CompilerInvocation::GenerateLangArgs(const LangOptions &Opts,
GenerateArg(Args, OPT_fopenmp_version_EQ, Twine(Opts.OpenMP), SA);
}
- if (Opts.OpenMPTargetNewRuntime)
- GenerateArg(Args, OPT_fopenmp_target_new_runtime, SA);
-
if (Opts.OpenMPThreadSubscription)
GenerateArg(Args, OPT_fopenmp_assume_threads_oversubscription, SA);
@@ -3877,9 +3874,6 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
Opts.OpenMP && Args.hasArg(options::OPT_fopenmp_enable_irbuilder);
bool IsTargetSpecified =
Opts.OpenMPIsDevice || Args.hasArg(options::OPT_fopenmp_targets_EQ);
- Opts.OpenMPTargetNewRuntime =
- Opts.OpenMPIsDevice &&
- Args.hasArg(options::OPT_fopenmp_target_new_runtime);
Opts.ConvergentFunctions = Opts.ConvergentFunctions || Opts.OpenMPIsDevice;
@@ -3927,17 +3921,13 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
// Set either by a specific value or to a default if not specified.
if (Opts.OpenMPIsDevice && (Args.hasArg(OPT_fopenmp_target_debug) ||
Args.hasArg(OPT_fopenmp_target_debug_EQ))) {
- if (Opts.OpenMPTargetNewRuntime) {
- Opts.OpenMPTargetDebug = getLastArgIntValue(
- Args, OPT_fopenmp_target_debug_EQ, Opts.OpenMPTargetDebug, Diags);
- if (!Opts.OpenMPTargetDebug && Args.hasArg(OPT_fopenmp_target_debug))
- Opts.OpenMPTargetDebug = 1;
- } else {
- Diags.Report(diag::err_drv_debug_no_new_runtime);
- }
+ Opts.OpenMPTargetDebug = getLastArgIntValue(
+ Args, OPT_fopenmp_target_debug_EQ, Opts.OpenMPTargetDebug, Diags);
+ if (!Opts.OpenMPTargetDebug && Args.hasArg(OPT_fopenmp_target_debug))
+ Opts.OpenMPTargetDebug = 1;
}
- if (Opts.OpenMPIsDevice && Opts.OpenMPTargetNewRuntime) {
+ if (Opts.OpenMPIsDevice) {
if (Args.hasArg(OPT_fopenmp_assume_teams_oversubscription))
Opts.OpenMPTeamSubscription = true;
if (Args.hasArg(OPT_fopenmp_assume_threads_oversubscription))
diff --git a/clang/test/Driver/amdgpu-openmp-toolchain.c b/clang/test/Driver/amdgpu-openmp-toolchain.c
index f6b22cd5973a8..78fee12a5a98c 100644
--- a/clang/test/Driver/amdgpu-openmp-toolchain.c
+++ b/clang/test/Driver/amdgpu-openmp-toolchain.c
@@ -1,6 +1,6 @@
// REQUIRES: x86-registered-target
// REQUIRES: amdgpu-registered-target
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib %s 2>&1 \
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib %s 2>&1 \
// RUN: | FileCheck %s
// verify the tools invocations
@@ -14,7 +14,7 @@
// CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-o" "{{.*}}a-{{.*}}.o" "-x" "ir" "{{.*}}a-{{.*}}.bc"
// CHECK: ld{{.*}}"-o" "a.out"{{.*}}"{{.*}}amdgpu-openmp-toolchain-{{.*}}.o" "{{.*}}a-{{.*}}.o" "-lomp" "-lomptarget"
-// RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 %s 2>&1 \
+// RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-PHASES %s
// phases
// CHECK-PHASES: 0: input, "{{.*}}amdgpu-openmp-toolchain.c", c, (host-openmp)
@@ -36,13 +36,13 @@
// CHECK-PHASES: 16: linker, {4, 15}, image, (host-openmp)
// handling of --libomptarget-amdgcn-bc-path
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgcn-gfx803.bc %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIBOMPTARGET
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgcn-gfx803.bc %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIBOMPTARGET
// CHECK-LIBOMPTARGET: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-target-cpu" "gfx803" "-fcuda-is-device" "-mlink-builtin-bitcode"{{.*}}Inputs/hip_dev_lib/libomptarget-amdgcn-gfx803.bc"{{.*}}
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-NOGPULIB
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-NOGPULIB
// CHECK-NOGPULIB-NOT: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-target-cpu" "gfx803" "-fcuda-is-device" "-mlink-builtin-bitcode"{{.*}}libomptarget-amdgcn-gfx803.bc"{{.*}}
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-PRINT-BINDINGS
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-PRINT-BINDINGS
// CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.*]]"],
// CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "clang",{{.*}} output: "[[HOST_BC:.*]]"
// CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]"], output: "[[HOST_S:.*]]"
@@ -56,13 +56,13 @@
// CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "GNU::Linker", inputs: ["[[HOST_O]]", "[[OFFLOAD_O]]"], output:
// verify the llc is invoked for textual assembly output
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-SAVE-ASM
// CHECK-SAVE-ASM: llc{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-linked.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx906" "-filetype=asm" "-o"{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906.s"
// CHECK-SAVE-ASM: llc{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-linked.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx906" "-filetype=obj" "-o"{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906.o"
// check the handling of -c
-// RUN: %clang -ccc-print-bindings -c --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \
+// RUN: %clang -ccc-print-bindings -c --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-C
// CHECK-C: "x86_64-unknown-linux-gnu" - "clang",
// CHECK-C: "x86_64-unknown-linux-gnu" - "clang",{{.*}}output: "[[HOST_BC:.*]]"
@@ -72,8 +72,8 @@
// CHECK-C: "x86_64-unknown-linux-gnu" - "clang::as"
// CHECK-C: "x86_64-unknown-linux-gnu" - "offload bundler"
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR
// CHECK-EMIT-LLVM-IR: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm"
-// RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -lm --rocm-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIB-DEVICE
+// RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -lm --rocm-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIB-DEVICE
// CHECK-LIB-DEVICE: {{.*}}llvm-link{{.*}}ocml.bc"{{.*}}ockl.bc"{{.*}}oclc_daz_opt_on.bc"{{.*}}oclc_unsafe_math_off.bc"{{.*}}oclc_finite_only_off.bc"{{.*}}oclc_correctly_rounded_sqrt_on.bc"{{.*}}oclc_wavefrontsize64_on.bc"{{.*}}oclc_isa_version_803.bc"
diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c
index af7ba7c802e5d..810f6ca2bcffa 100644
--- a/clang/test/Driver/openmp-offload-gpu.c
+++ b/clang/test/Driver/openmp-offload-gpu.c
@@ -155,43 +155,24 @@
// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
// RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc \
// RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fopenmp-target-new-runtime -save-temps -no-canonical-prefixes %s 2>&1 \
+// RUN: -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHK-BCLIB %s
/// Specify the directory containing the bitcode lib, check clang picks the right one
// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
// RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget \
// RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fno-openmp-target-new-runtime -save-temps \
+// RUN: -fopenmp-relocatable-target -save-temps \
// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-DIR %s
-/// Check with the new runtime enabled
-// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
-// RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fopenmp-target-new-runtime \
-// RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-new-nvptx-test.bc \
-// RUN: -save-temps -no-canonical-prefixes %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHK-BCLIB-NEW %s
-
-/// Check with new runtime and specifying the directory
-// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
-// RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-
-// RUN: -fopenmp-relocatable-target -fopenmp-target-new-runtime \
-// RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget -save-temps \
-// RUN: -no-canonical-prefixes %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHK-BCLIB-NEW-DIR %s
-
/// Create a bogus bitcode library and find it with LIBRARY_PATH
// RUN: env LIBRARY_PATH=%S/Inputs/libomptarget/subdir %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
// RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fno-openmp-target-new-runtime -save-temps \
+// RUN: -fopenmp-relocatable-target -save-temps \
// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-ENV-BCLIB %s
// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-test.bc
// CHK-BCLIB-DIR: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget{{/|\\\\}}libomptarget-nvptx-sm_35.bc
-// CHK-BCLIB-NEW: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-new-nvptx-test.bc
-// CHK-BCLIB-NEW-DIR: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget{{/|\\\\}}libomptarget-new-nvptx-sm_35.bc
// CHK-ENV-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}subdir{{/|\\\\}}libomptarget-nvptx-sm_35.bc
// CHK-BCLIB-NOT: {{error:|warning:}}
@@ -204,7 +185,7 @@
// RUN: -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHK-BCLIB-WARN %s
-// CHK-BCLIB-WARN: no library 'libomptarget-new-nvptx-sm_35.bc' found in the default clang lib directory or in LIBRARY_PATH; use '--libomptarget-nvptx-bc-path' to specify nvptx bitcode library
+// CHK-BCLIB-WARN: no library 'libomptarget-nvptx-sm_35.bc' found in the default clang lib directory or in LIBRARY_PATH; use '--libomptarget-nvptx-bc-path' to specify nvptx bitcode library
/// ###########################################################################
diff --git a/clang/test/OpenMP/target_globals_codegen.cpp b/clang/test/OpenMP/target_globals_codegen.cpp
index 7aec922987574..fa7569cd4ca6b 100644
--- a/clang/test/OpenMP/target_globals_codegen.cpp
+++ b/clang/test/OpenMP/target_globals_codegen.cpp
@@ -1,12 +1,12 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --global-value-regex "__omp_rtl_"
// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-target-debug -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-target-debug=111 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-EQ
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-DEFAULT
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-debug -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-debug=111 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-EQ
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-DEFAULT
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME
// expected-no-diagnostics
#ifndef HEADER
diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt
index 63f0801025e5e..0dbacd81b0082 100644
--- a/openmp/libomptarget/CMakeLists.txt
+++ b/openmp/libomptarget/CMakeLists.txt
@@ -38,13 +38,11 @@ endif()
# This is a list of all the targets that are supported/tested right now.
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} aarch64-unknown-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa")
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa-newRTL")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa-newDriver")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64le-ibm-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda")
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-newRTL")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-newDriver")
# Once the plugins for the different targets are validated, they will be added to
@@ -81,7 +79,6 @@ set(LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER "${LIBOMP_LIBRARY_DIR}" CACHE STRING
# Build offloading plugins and device RTLs if they are available.
add_subdirectory(plugins)
-add_subdirectory(deviceRTLs)
add_subdirectory(DeviceRTL)
add_subdirectory(tools)
diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
index d8b9e40802b85..ec586959868a1 100644
--- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -180,7 +180,7 @@ function(compileDeviceRTLLibrary target_cpu target_name)
list(APPEND bc_files ${outfile})
endforeach()
- set(bclib_name "libomptarget-new-${target_name}-${target_cpu}.bc")
+ set(bclib_name "libomptarget-${target_name}-${target_cpu}.bc")
# Link to a bitcode library.
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name}
@@ -212,7 +212,7 @@ function(compileDeviceRTLLibrary target_cpu target_name)
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
- set(bclib_target_name "omptarget-new-${target_name}-${target_cpu}-bc")
+ set(bclib_target_name "omptarget-${target_name}-${target_cpu}-bc")
add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
diff --git a/openmp/libomptarget/deviceRTLs/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/CMakeLists.txt
deleted file mode 100644
index 8bbf987aaf205..0000000000000
--- a/openmp/libomptarget/deviceRTLs/CMakeLists.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# ##===----------------------------------------------------------------------===##
-#
-# Build a device RTL for each available machine.
-#
-##===----------------------------------------------------------------------===##
-
-add_subdirectory(amdgcn)
-add_subdirectory(nvptx)
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
deleted file mode 100644
index 4060130730248..0000000000000
--- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
+++ /dev/null
@@ -1,193 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build the AMDGCN Device RTL bitcode library using clang -ffreestanding
-#
-##===----------------------------------------------------------------------===##
-
-set(LIBOMPTARGET_BUILD_AMDGCN_BCLIB FALSE CACHE BOOL
- "Can be set to true to enable building this library.")
-
-if (NOT LIBOMPTARGET_BUILD_AMDGCN_BCLIB)
- libomptarget_say("Not building AMDGCN device RTL: Disabled by LIBOMPTARGET_BUILD_AMDGCN_BCLIB")
- return()
-endif()
-
-if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
- libomptarget_say("Not building AMDGCN device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
- return()
-endif()
-
-
-# Copied from nvptx CMakeLists
-if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
- set(aux_triple x86_64-unknown-linux-gnu)
-elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le")
- set(aux_triple powerpc64le-unknown-linux-gnu)
-elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
- set(aux_triple aarch64-unknown-linux-gnu)
-else()
- libomptarget_say("Not building AMDGCN device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
- return()
-endif()
-
-if (LLVM_DIR)
- # Builds that use pre-installed LLVM have LLVM_DIR set.
- find_program(CLANG_TOOL clang PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
- find_program(LINK_TOOL llvm-link PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
- find_program(OPT_TOOL opt PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
- if ((NOT CLANG_TOOL) OR (NOT LINK_TOOL) OR (NOT OPT_TOOL))
- libomptarget_say("Not building AMDGCN device RTL. Missing clang: ${CLANG_TOOL}, llvm-link: ${LINK_TOOL} or opt: ${OPT_TOOL}")
- return()
- else()
- libomptarget_say("Building AMDGCN device RTL. Using clang: ${CLANG_TOOL}, llvm-link: ${LINK_TOOL} and opt: ${OPT_TOOL}")
- endif()
-elseif (LLVM_TOOL_CLANG_BUILD AND NOT CMAKE_CROSSCOMPILING AND NOT OPENMP_STANDALONE_BUILD)
- # LLVM in-tree builds may use CMake target names to discover the tools.
- set(CLANG_TOOL $<TARGET_FILE:clang>)
- set(LINK_TOOL $<TARGET_FILE:llvm-link>)
- set(OPT_TOOL $<TARGET_FILE:opt>)
- libomptarget_say("Building AMDGCN device RTL. Using clang from in-tree build")
-else()
- libomptarget_say("Not building AMDGCN device RTL. No appropriate clang found")
- return()
-endif()
-
-project(omptarget-amdgcn)
-
-add_custom_target(omptarget-amdgcn ALL)
-
-#optimization level
-set(optimization_level 2)
-
-# Activate RTL message dumps if requested by the user.
-if(LIBOMPTARGET_NVPTX_DEBUG)
- set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g)
-endif()
-
-get_filename_component(devicertl_base_directory
- ${CMAKE_CURRENT_SOURCE_DIR}
- DIRECTORY)
-
-set(cuda_sources
- ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_smid.hip
- ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_locks.hip
- ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.hip
- ${devicertl_base_directory}/common/src/cancel.cu
- ${devicertl_base_directory}/common/src/critical.cu
- ${devicertl_base_directory}/common/src/data_sharing.cu
- ${devicertl_base_directory}/common/src/libcall.cu
- ${devicertl_base_directory}/common/src/loop.cu
- ${devicertl_base_directory}/common/src/omp_data.cu
- ${devicertl_base_directory}/common/src/omptarget.cu
- ${devicertl_base_directory}/common/src/parallel.cu
- ${devicertl_base_directory}/common/src/reduction.cu
- ${devicertl_base_directory}/common/src/support.cu
- ${devicertl_base_directory}/common/src/shuffle.cpp
- ${devicertl_base_directory}/common/src/sync.cu
- ${devicertl_base_directory}/common/src/task.cu)
-
-set(h_files
- ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_interface.h
- ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.h
- ${devicertl_base_directory}/common/debug.h
- ${devicertl_base_directory}/common/omptarget.h
- ${devicertl_base_directory}/common/omptargeti.h
- ${devicertl_base_directory}/common/state-queue.h
- ${devicertl_base_directory}/common/state-queuei.h
- ${devicertl_base_directory}/common/support.h)
-
-# for both in-tree and out-of-tree build
-if (NOT CMAKE_ARCHIVE_OUTPUT_DIRECTORY)
- set(OUTPUTDIR ${CMAKE_CURRENT_BINARY_DIR})
-else()
- set(OUTPUTDIR ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY})
-endif()
-
-# create gfx bitcode libraries
-set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900 gfx902 gfx906 gfx908 gfx90a gfx1010 gfx1030 gfx1031)
-if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST)
- set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST})
-endif()
-
-# Prepend -I to each list element
-set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
-list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN PREPEND "-I")
-
-macro(add_cuda_bc_library)
- set(cu_cmd ${CLANG_TOOL}
- -xc++
- -c
- -mllvm -openmp-opt-disable
- -std=c++14
- -ffreestanding
- -target amdgcn-amd-amdhsa
- -emit-llvm
- -Xclang -aux-triple -Xclang ${aux_triple}
- -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
- -D__AMDGCN__
- -Xclang -target-cpu -Xclang ${mcpu}
- -fvisibility=hidden
- -Wno-unused-value
- -nogpulib
- -O${optimization_level}
- ${CUDA_DEBUG}
- -I${CMAKE_CURRENT_SOURCE_DIR}/src
- -I${devicertl_base_directory}/common/include
- -I${devicertl_base_directory}
- -I${devicertl_base_directory}/../include
- ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN})
-
- set(bc1_files)
-
- foreach(file ${ARGN})
- get_filename_component(fname ${file} NAME_WE)
- set(bc1_filename ${fname}.${mcpu}.bc)
-
- add_custom_command(
- OUTPUT ${bc1_filename}
- COMMAND ${cu_cmd} ${file} -o ${bc1_filename}
- DEPENDS ${file} ${h_files})
-
- list(APPEND bc1_files ${bc1_filename})
- endforeach()
-
- add_custom_command(
- OUTPUT linkout.cuda.${mcpu}.bc
- COMMAND ${LINK_TOOL} ${bc1_files} -o linkout.cuda.${mcpu}.bc
- DEPENDS ${bc1_files})
-
- list(APPEND bc_files linkout.cuda.${mcpu}.bc)
-endmacro()
-
-set(libname "omptarget-amdgcn")
-
-set(toolchain_deps "")
-if(TARGET llvm-link)
- list(APPEND toolchain_deps llvm-link)
-endif()
-if(TARGET opt)
- list(APPEND toolchain_deps opt)
-endif()
-
-foreach(mcpu ${mcpus})
- set(bc_files)
- add_cuda_bc_library(${cuda_sources})
-
- set(bc_libname lib${libname}-${mcpu}.bc)
- add_custom_command(
- OUTPUT ${bc_libname}
- COMMAND ${LINK_TOOL} ${bc_files} | ${OPT_TOOL} --always-inline -o ${OUTPUTDIR}/${bc_libname}
- DEPENDS ${bc_files} ${toolchain_deps})
-
- add_custom_target(lib${libname}-${mcpu} ALL DEPENDS ${bc_libname})
-
- install(FILES ${OUTPUTDIR}/${bc_libname}
- DESTINATION "${OPENMP_INSTALL_LIBDIR}"
- )
-endforeach()
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h
deleted file mode 100644
index c3b2f59d636e2..0000000000000
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h
+++ /dev/null
@@ -1,19 +0,0 @@
-//===--- amdgcn_interface.h - OpenMP interface definitions ------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _AMDGCN_INTERFACE_H_
-#define _AMDGCN_INTERFACE_H_
-
-#include <stdint.h>
-
-#define EXTERN extern "C"
-typedef uint32_t omp_lock_t; /* arbitrary type of the right length */
-
-EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads();
-
-#endif
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip
deleted file mode 100644
index 2261505476108..0000000000000
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip
+++ /dev/null
@@ -1,34 +0,0 @@
-//===-- amdgcn_locks.hip - AMDGCN OpenMP GPU lock implementation -- HIP -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// A 'thread' maps onto a lane of the wavefront. This means a per-thread lock
-// cannot be implemented - if one thread gets the lock, it can't continue on to
-// the next instruction in order to do anything as the other threads are waiting
-// to take the lock.
-// These functions will be implemented to provide the documented semantics for
-// a SIMD => wavefront mapping once that is implemented.
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "common/debug.h"
-
-static void warn() {
- PRINT0(LD_ALL, "Locks are not supported in this thread mapping model");
-}
-
-void __kmpc_impl_init_lock(omp_lock_t *) { warn(); }
-void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); }
-void __kmpc_impl_set_lock(omp_lock_t *) { warn(); }
-void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); }
-int __kmpc_impl_test_lock(omp_lock_t *lock) {
- warn();
- return 0;
-}
-
-#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip
deleted file mode 100644
index f18f8b5a70c86..0000000000000
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip
+++ /dev/null
@@ -1,64 +0,0 @@
-//===-------- amdgcn_smid.hip - AMDGCN smid implementation -------- HIP -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "target_impl.h"
-
-// Partially derived from hcc_detail/device_functions.h
-
-// HW_ID Register bit structure
-// WAVE_ID 3:0 Wave buffer slot number. 0-9.
-// SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
-// PIPE_ID 7:6 Pipeline from which the wave was dispatched.
-// CU_ID 11:8 Compute Unit the wave is assigned to.
-// SH_ID 12 Shader Array (within an SE) the wave is assigned to.
-// SE_ID 14:13 Shader Engine the wave is assigned to.
-// TG_ID 19:16 Thread-group ID
-// VM_ID 23:20 Virtual Memory ID
-// QUEUE_ID 26:24 Queue from which this wave was dispatched.
-// STATE_ID 29:27 State ID (graphics only, not compute).
-// ME_ID 31:30 Micro-engine ID.
-
-enum {
- HW_ID = 4, // specify that the hardware register to read is HW_ID
-
- HW_ID_CU_ID_SIZE = 4, // size of CU_ID field in bits
- HW_ID_CU_ID_OFFSET = 8, // offset of CU_ID from start of register
-
- HW_ID_SE_ID_SIZE = 2, // sizeof SE_ID field in bits
- HW_ID_SE_ID_OFFSET = 13, // offset of SE_ID from start of register
-};
-
-// The s_getreg_b32 instruction, exposed as an intrinsic, takes a 16 bit
-// immediate and returns a 32 bit value.
-// The encoding of the immediate parameter is:
-// ID 5:0 Which register to read from
-// OFFSET 10:6 Range: 0..31
-// WIDTH 15:11 Range: 1..32
-
-// The asm equivalent is s_getreg_b32 %0, hwreg(HW_REG_HW_ID, Offset, Width)
-// where hwreg forms a 16 bit immediate encoded by the assembler thus:
-// uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) {
-// return (Id << 0_) | (Offset << 6) | ((Width - 1) << 11);
-// }
-#define ENCODE_HWREG(WIDTH, OFF, REG) (REG | (OFF << 6) | ((WIDTH - 1) << 11))
-
-// Note: The results can be changed by a context switch
-// Return value in [0 2^SE_ID_SIZE * 2^CU_ID_SIZE), which is an upper
-// bound on how many compute units are available. Some values in this
-// range may never be returned if there are fewer than 2^CU_ID_SIZE CUs.
-
-EXTERN uint32_t __kmpc_impl_smid() {
- uint32_t cu_id = __builtin_amdgcn_s_getreg(
- ENCODE_HWREG(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));
- uint32_t se_id = __builtin_amdgcn_s_getreg(
- ENCODE_HWREG(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));
- return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
-}
-
-#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
deleted file mode 100644
index 3ea7cdeb5b2bc..0000000000000
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ /dev/null
@@ -1,83 +0,0 @@
-//===------- target_impl.h - AMDGCN OpenMP GPU implementation ----- HIP -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Declarations and definitions of target specific functions and constants
-//
-//===----------------------------------------------------------------------===//
-#ifndef OMPTARGET_AMDGCN_TARGET_IMPL_H
-#define OMPTARGET_AMDGCN_TARGET_IMPL_H
-
-#ifndef __AMDGCN__
-#error "amdgcn target_impl.h expects to be compiled under __AMDGCN__"
-#endif
-
-#include "amdgcn_interface.h"
-
-#include <stddef.h>
-#include <stdint.h>
-
-// subset of inttypes.h
-#define PRId64 "ld"
-#define PRIu64 "lu"
-
-typedef uint64_t __kmpc_impl_lanemask_t;
-
-#define INLINE inline
-#define NOINLINE __attribute__((noinline))
-#define ALIGN(N) __attribute__((aligned(N)))
-#define PLUGIN_ACCESSIBLE \
- __attribute__((used)) /* Don't discard values the plugin reads */ \
- __attribute__((weak)) /* We may have multiple definitions */ \
- __attribute__((retain)) /* Also needed to keep values alive */ \
- __attribute__((visibility("protected"))) /* Access via SHT_HASH */ \
- __attribute__((section(".data"))) /* Not .bss, can write before load */
-
-#include "llvm/Frontend/OpenMP/OMPGridValues.h"
-
-INLINE constexpr const llvm::omp::GV &getGridValue() {
- return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Kernel options
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-// The following def must match the absolute limit hardwired in the host RTL
-// max number of threads per team
-enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };
-enum { WARPSIZE = getGridValue().GV_Warp_Size };
-
-// Maximum number of omp state objects per SM allocated statically in global
-// memory.
-#define OMP_STATE_COUNT 32
-#define MAX_SM 64
-
-#define OMP_ACTIVE_PARALLEL_LEVEL 128
-
-// Data sharing related quantities, need to match what is used in the compiler.
-enum DATA_SHARING_SIZES {
- // The size reserved for data in a shared memory slot.
- DS_Slot_Size = getGridValue().GV_Slot_Size,
- // The slot size that should be reserved for a working warp.
- DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(),
- // The maximum number of warps in use
- DS_Max_Warp_Number = getGridValue().maxWarpNumber(),
-};
-
-enum : __kmpc_impl_lanemask_t {
- __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
-};
-
-// The return code of printf is not checked in the call sites in this library.
-// A call to a function named printf currently hits some special case handling
-// for opencl, which translates to calls that do not presently exist for openmp
-// Therefore, for now, stub out printf while building this library.
-#define printf(...)
-
-#endif
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
deleted file mode 100644
index d5161daaced82..0000000000000
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ /dev/null
@@ -1,226 +0,0 @@
-//===------- target_impl.hip - AMDGCN OpenMP GPU implementation --- HIP -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Definitions of target specific functions
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "common/omptarget.h"
-#include "target_impl.h"
-#include "target_interface.h"
-
-// Implementations initially derived from hcc
-
-// Initialized with a 64-bit mask with bits set in positions less than the
-// thread's lane number in the warp
-EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
- uint32_t lane = GetLaneId();
- int64_t ballot = __kmpc_impl_activemask();
- uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
- return mask & ballot;
-}
-
-// Initialized with a 64-bit mask with bits set in positions greater than the
-// thread's lane number in the warp
-EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
- uint32_t lane = GetLaneId();
- if (lane == (WARPSIZE - 1))
- return 0;
- uint64_t ballot = __kmpc_impl_activemask();
- uint64_t mask = (~((uint64_t)0)) << (lane + 1);
- return mask & ballot;
-}
-
-EXTERN double __kmpc_impl_get_wtick() { return ((double)1E-9); }
-
-EXTERN double __kmpc_impl_get_wtime() {
- // The intrinsics for measuring time have undocumented frequency
- // This will probably need to be found by measurement on a number of
- // architectures. Until then, return 0, which is very inaccurate as a
- // timer but resolves the undefined symbol at link time.
- return 0;
-}
-
-// Warp vote function
-EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
- return __builtin_amdgcn_read_exec();
-}
-
-static void pteam_mem_barrier(uint32_t num_threads, uint32_t *barrier_state) {
- __atomic_thread_fence(__ATOMIC_ACQUIRE);
-
- uint32_t num_waves = (num_threads + WARPSIZE - 1) / WARPSIZE;
-
- // Partial barrier implementation for amdgcn.
- // Uses two 16 bit unsigned counters. One for the number of waves to have
- // reached the barrier, and one to count how many times the barrier has been
- // passed. These are packed in a single atomically accessed 32 bit integer.
- // Low bits for the number of waves, assumed zero before this call.
- // High bits to count the number of times the barrier has been passed.
-
- // precondition: num_waves != 0;
- // invariant: num_waves * WARPSIZE == num_threads;
- // precondition: num_waves < 0xffffu;
-
- // Increment the low 16 bits once, using the lowest active thread.
- uint64_t lowestActiveThread = __kmpc_impl_ffs(__kmpc_impl_activemask()) - 1;
- bool isLowest = GetLaneId() == lowestActiveThread;
-
- if (isLowest) {
- uint32_t load = __atomic_fetch_add(barrier_state, 1,
- __ATOMIC_RELAXED); // commutative
-
- // Record the number of times the barrier has been passed
- uint32_t generation = load & 0xffff0000u;
-
- if ((load & 0x0000ffffu) == (num_waves - 1)) {
- // Reached num_waves in low bits so this is the last wave.
- // Set low bits to zero and increment high bits
- load += 0x00010000u; // wrap is safe
- load &= 0xffff0000u; // because bits zeroed second
-
- // Reset the wave counter and release the waiting waves
- __atomic_store_n(barrier_state, load, __ATOMIC_RELAXED);
- } else {
- // more waves still to go, spin until generation counter changes
- do {
- __builtin_amdgcn_s_sleep(0);
- load = __atomic_load_n(barrier_state, __ATOMIC_RELAXED);
- } while ((load & 0xffff0000u) == generation);
- }
- }
- __atomic_thread_fence(__ATOMIC_RELEASE);
-}
-
-uint32_t __kmpc_L0_Barrier [[clang::loader_uninitialized]];
-#pragma allocate(__kmpc_L0_Barrier) allocator(omp_pteam_mem_alloc)
-
-EXTERN void __kmpc_impl_target_init() {
- // Don't have global ctors, and shared memory is not zero init
- __atomic_store_n(&__kmpc_L0_Barrier, 0u, __ATOMIC_RELEASE);
-}
-
-EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
- pteam_mem_barrier(num_threads, &__kmpc_L0_Barrier);
-}
-
-namespace {
-uint32_t get_grid_dim(uint32_t n, uint16_t d) {
- uint32_t q = n / d;
- return q + (n > q * d);
-}
-uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,
- uint16_t group_size) {
- uint32_t r = grid_size - group_id * group_size;
- return (r < group_size) ? r : group_size;
-}
-} // namespace
-
-EXTERN int __kmpc_get_hardware_num_blocks() {
- return get_grid_dim(__builtin_amdgcn_grid_size_x(),
- __builtin_amdgcn_workgroup_size_x());
-}
-
-EXTERN int __kmpc_get_hardware_num_threads_in_block() {
- return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(),
- __builtin_amdgcn_grid_size_x(),
- __builtin_amdgcn_workgroup_size_x());
-}
-
-EXTERN unsigned __kmpc_get_warp_size() {
- return WARPSIZE;
-}
-
-EXTERN unsigned GetWarpId() { return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE; }
-EXTERN unsigned GetLaneId() {
- return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
-}
-
-EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {
- return __kmpc_get_hardware_num_threads_in_block();
-}
-
-// Atomics
-uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
- return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
-}
-uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
- return __builtin_amdgcn_atomic_inc32(Address, Val, __ATOMIC_SEQ_CST, "");
-}
-uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
- return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
-}
-
-uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
- uint32_t R;
- __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
- return R;
-}
-uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) {
- (void)__atomic_compare_exchange(Address, &Compare, &Val, false,
- __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
- return Compare;
-}
-
-unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
- unsigned long long Val) {
- unsigned long long R;
- __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
- return R;
-}
-unsigned long long __kmpc_atomic_add(unsigned long long *Address,
- unsigned long long Val) {
- return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
-}
-
-// Stub implementations
-// Weak to allow overriding by local versions while comparing different
-// potential implementations
-__attribute__((weak)) EXTERN void *__kmpc_impl_malloc(size_t) {
- return nullptr;
-}
-__attribute__((weak)) EXTERN void __kmpc_impl_free(void *) {}
-
-EXTERN
-int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t) {
- return -1;
-}
-
-EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
- lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF));
- hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
-}
-
-EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
- return (((uint64_t)hi) << 32) | (uint64_t)lo;
-}
-
-EXTERN void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); }
-
-EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) {
- // AMDGCN doesn't need to sync threads in a warp
-}
-
-EXTERN void __kmpc_impl_threadfence() {
- __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
-}
-
-EXTERN void __kmpc_impl_threadfence_block() {
- __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
-}
-
-EXTERN void __kmpc_impl_threadfence_system() {
- __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
-}
-
-// Calls to the AMDGCN layer (assuming 1D layout)
-EXTERN int __kmpc_get_hardware_thread_id_in_block() { return __builtin_amdgcn_workitem_id_x(); }
-EXTERN int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); }
-
-#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/allocator.h b/openmp/libomptarget/deviceRTLs/common/allocator.h
deleted file mode 100644
index bd1d18e14ae9b..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/allocator.h
+++ /dev/null
@@ -1,44 +0,0 @@
-//===--------- allocator.h - OpenMP target memory allocator ------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Macros for allocating variables in different address spaces.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_ALLOCATOR_H
-#define OMPTARGET_ALLOCATOR_H
-
-#if _OPENMP
-// Follows the pattern in interface.h
-// Clang sema checks this type carefully, needs to closely match that from omp.h
-typedef enum omp_allocator_handle_t {
- omp_null_allocator = 0,
- omp_default_mem_alloc = 1,
- omp_large_cap_mem_alloc = 2,
- omp_const_mem_alloc = 3,
- omp_high_bw_mem_alloc = 4,
- omp_low_lat_mem_alloc = 5,
- omp_cgroup_mem_alloc = 6,
- omp_pteam_mem_alloc = 7,
- omp_thread_mem_alloc = 8,
- KMP_ALLOCATOR_MAX_HANDLE = ~(0U)
-} omp_allocator_handle_t;
-
-#define __PRAGMA(STR) _Pragma(#STR)
-#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
-
-#define SHARED(NAME) \
- NAME [[clang::loader_uninitialized]]; \
- OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
-
-#define EXTERN_SHARED(NAME) \
- NAME; \
- OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
-#endif
-
-#endif // OMPTARGET_ALLOCATOR_H
diff --git a/openmp/libomptarget/deviceRTLs/common/debug.h b/openmp/libomptarget/deviceRTLs/common/debug.h
deleted file mode 100644
index 4ca1e5563fb36..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/debug.h
+++ /dev/null
@@ -1,293 +0,0 @@
-//===------------- debug.h - NVPTX OpenMP debug macros ----------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains debug macros to be used in the application.
-//
-// Usage guide
-//
-// PRINT0(flag, str) : if debug flag is on, print (no arguments)
-// PRINT(flag, str, args) : if debug flag is on, print (arguments)
-// DON(flag) : return true if debug flag is on
-//
-// ASSERT(flag, cond, str, args): if test flag is on, test the condition
-// if the condition is false, print str+args
-// and assert.
-// CAUTION: cond may be evaluate twice
-// AON(flag) : return true if test flag is on
-//
-// WARNING(flag, str, args) : if warning flag is on, print the warning
-// WON(flag) : return true if warning flag is on
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _OMPTARGET_NVPTX_DEBUG_H_
-#define _OMPTARGET_NVPTX_DEBUG_H_
-
-#include "target_interface.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// set desired level of debugging
-////////////////////////////////////////////////////////////////////////////////
-
-#define LD_SET_NONE 0ULL /* none */
-#define LD_SET_ALL -1ULL /* all */
-
-// pos 1
-#define LD_SET_LOOP 0x1ULL /* basic loop */
-#define LD_SET_LOOPD 0x2ULL /* basic loop */
-#define LD_SET_PAR 0x4ULL /* basic parallel */
-#define LD_SET_PARD 0x8ULL /* basic parallel */
-
-// pos 2
-#define LD_SET_SYNC 0x10ULL /* sync info */
-#define LD_SET_SYNCD 0x20ULL /* sync info */
-#define LD_SET_WAIT 0x40ULL /* state when waiting */
-#define LD_SET_TASK 0x80ULL /* print task info (high level) */
-
-// pos 3
-#define LD_SET_IO 0x100ULL /* big region io (excl atomic) */
-#define LD_SET_IOD 0x200ULL /* big region io (excl atomic) */
-#define LD_SET_ENV 0x400ULL /* env info */
-#define LD_SET_CANCEL 0x800ULL /* print cancel info */
-
-// pos 4
-#define LD_SET_MEM 0x1000ULL /* malloc / free */
-
-////////////////////////////////////////////////////////////////////////////////
-// set the desired flags to print selected output.
-
-// these are some examples of possible definitions that can be used for
-// debugging.
-//#define OMPTARGET_NVPTX_DEBUG (LD_SET_ALL)
-//#define OMPTARGET_NVPTX_DEBUG (LD_SET_LOOP) // limit to loop printfs to save
-// on cuda buffer
-//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO)
-//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO | LD_SET_ENV)
-//#define OMPTARGET_NVPTX_DEBUG (LD_SET_PAR)
-
-#ifndef OMPTARGET_NVPTX_DEBUG
-#define OMPTARGET_NVPTX_DEBUG LD_SET_NONE
-#elif OMPTARGET_NVPTX_DEBUG
-#warning debug is used, not good for measurements
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// set desired level of asserts
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-// available flags
-
-#define LT_SET_NONE 0x0 /* unsafe */
-#define LT_SET_SAFETY \
- 0x1 /* check malloc type of stuff, input at creation, cheap */
-#define LT_SET_INPUT 0x2 /* check also all runtime inputs */
-#define LT_SET_FUSSY 0x4 /* fussy checks, expensive */
-
-////////////////////////////////////////////////////////////////////////////////
-// set the desired flags
-
-#ifndef OMPTARGET_NVPTX_TEST
-#if OMPTARGET_NVPTX_DEBUG
-#define OMPTARGET_NVPTX_TEST (LT_SET_FUSSY)
-#else
-#define OMPTARGET_NVPTX_TEST (LT_SET_SAFETY)
-#endif
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// set desired level of warnings
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-// available flags
-
-#define LW_SET_ALL -1
-#define LW_SET_NONE 0x0
-#define LW_SET_ENV 0x1
-#define LW_SET_INPUT 0x2
-#define LW_SET_FUSSY 0x4
-
-////////////////////////////////////////////////////////////////////////////////
-// set the desired flags
-
-#if OMPTARGET_NVPTX_DEBUG
-#define OMPTARGET_NVPTX_WARNING (LW_SET_NONE)
-#else
-#define OMPTARGET_NVPTX_WARNING (LW_SET_FUSSY)
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// implementation for debug
-////////////////////////////////////////////////////////////////////////////////
-
-#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING
-#include "common/support.h"
-
-template <typename... Arguments>
-NOINLINE static void log(const char *fmt, Arguments... parameters) {
- printf(fmt, (int)GetBlockIdInKernel(),
- (int)__kmpc_get_hardware_thread_id_in_block(), (int)GetWarpId(),
- (int)GetLaneId(), parameters...);
-}
-
-#endif
-#if OMPTARGET_NVPTX_TEST
-
-template <typename... Arguments>
-NOINLINE static void check(bool cond, const char *fmt,
- Arguments... parameters) {
- if (!cond) {
- printf(fmt, (int)GetBlockIdInKernel(),
- (int)__kmpc_get_hardware_thread_id_in_block(), (int)GetWarpId(),
- (int)GetLaneId(), parameters...);
- __builtin_trap();
- }
-}
-
-NOINLINE static void check(bool cond) {
- if (!cond)
- __builtin_trap();
-}
-#endif
-
-// set flags that are tested (inclusion properties)
-
-#define LD_ALL (LD_SET_ALL)
-
-#define LD_LOOP (LD_SET_LOOP | LD_SET_LOOPD)
-#define LD_LOOPD (LD_SET_LOOPD)
-#define LD_PAR (LD_SET_PAR | LD_SET_PARD)
-#define LD_PARD (LD_SET_PARD)
-
-// pos 2
-#define LD_SYNC (LD_SET_SYNC | LD_SET_SYNCD)
-#define LD_SYNCD (LD_SET_SYNCD)
-#define LD_WAIT (LD_SET_WAIT)
-#define LD_TASK (LD_SET_TASK)
-
-// pos 3
-#define LD_IO (LD_SET_IO | LD_SET_IOD)
-#define LD_IOD (LD_SET_IOD)
-#define LD_ENV (LD_SET_ENV)
-#define LD_CANCEL (LD_SET_CANCEL)
-
-// pos 3
-#define LD_MEM (LD_SET_MEM)
-
-// implement
-#if OMPTARGET_NVPTX_DEBUG
-
-#define DON(_flag) ((unsigned)(OMPTARGET_NVPTX_DEBUG) & (_flag))
-
-#define PRINT0(_flag, _str) \
- { \
- if (omptarget_device_environment.debug_level && DON(_flag)) { \
- log("<b %2d, t %4d, w %2d, l %2d>: " _str); \
- } \
- }
-
-#define PRINT(_flag, _str, _args...) \
- { \
- if (omptarget_device_environment.debug_level && DON(_flag)) { \
- log("<b %2d, t %4d, w %2d, l %2d>: " _str, _args); \
- } \
- }
-#else
-
-#define DON(_flag) (0)
-#define PRINT0(flag, str)
-#define PRINT(flag, str, _args...)
-
-#endif
-
-// for printing without worrying about precision, pointers...
-#define P64(_x) ((unsigned long long)(_x))
-
-////////////////////////////////////////////////////////////////////////////////
-// early defs for test
-////////////////////////////////////////////////////////////////////////////////
-
-#define LT_SAFETY (LT_SET_SAFETY | LT_SET_INPUT | LT_SET_FUSSY)
-#define LT_INPUT (LT_SET_INPUT | LT_SET_FUSSY)
-#define LT_FUSSY (LT_SET_FUSSY)
-
-#if OMPTARGET_NVPTX_TEST == LT_SET_SAFETY
-
-#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag))
-#define ASSERT0(_flag, _cond, _str) \
- { \
- if (TON(_flag)) { \
- check(_cond); \
- } \
- }
-#define ASSERT(_flag, _cond, _str, _args...) \
- { \
- if (TON(_flag)) { \
- check(_cond); \
- } \
- }
-
-#elif OMPTARGET_NVPTX_TEST >= LT_SET_INPUT
-
-#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag))
-#define ASSERT0(_flag, _cond, _str) \
- { \
- if (TON(_flag)) { \
- check((_cond), "<b %3d, t %4d, w %2d, l %2d> ASSERT: " _str "\n"); \
- } \
- }
-#define ASSERT(_flag, _cond, _str, _args...) \
- { \
- if (TON(_flag)) { \
- check((_cond), "<b %3d, t %4d, w %2d, l %d2> ASSERT: " _str "\n", \
- _args); \
- } \
- }
-
-#else
-
-#define TON(_flag) (0)
-#define ASSERT0(_flag, _cond, _str)
-#define ASSERT(_flag, _cond, _str, _args...)
-
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// early defs for warning
-
-#define LW_ALL (LW_SET_ALL)
-#define LW_ENV (LW_SET_FUSSY | LW_SET_INPUT | LW_SET_ENV)
-#define LW_INPUT (LW_SET_FUSSY | LW_SET_INPUT)
-#define LW_FUSSY (LW_SET_FUSSY)
-
-#if OMPTARGET_NVPTX_WARNING
-
-#define WON(_flag) ((OMPTARGET_NVPTX_WARNING) & (_flag))
-#define WARNING0(_flag, _str) \
- { \
- if (WON(_flag)) { \
- log("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str); \
- } \
- }
-#define WARNING(_flag, _str, _args...) \
- { \
- if (WON(_flag)) { \
- log("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str, _args); \
- } \
- }
-
-#else
-
-#define WON(_flag) (0)
-#define WARNING0(_flag, _str)
-#define WARNING(_flag, _str, _args...)
-
-#endif
-
-#endif
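For context, a short illustrative sketch (not from this patch) of call sites
for the debug macros documented above; the flags, condition, and messages are
made up for the example.

  // Assumes debug.h above is included; chunk is an arbitrary input.
  void example_debug_usage(int chunk) {
    PRINT0(LD_IO, "entering example region\n");
    PRINT(LD_LOOP, "dispatching with chunk %d\n", chunk);
    ASSERT0(LT_FUSSY, chunk > 0, "expected a positive chunk size");
    WARNING(LW_INPUT, "chunk %d may be smaller than intended\n", chunk);
  }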
diff --git a/openmp/libomptarget/deviceRTLs/common/generated_microtask_cases.gen b/openmp/libomptarget/deviceRTLs/common/generated_microtask_cases.gen
deleted file mode 100644
index 9c4e03576c224..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/generated_microtask_cases.gen
+++ /dev/null
@@ -1,405 +0,0 @@
-case 0:
-((void (*)(kmp_int32 *, kmp_int32 *
-))fn)(&global_tid, &bound_tid
-);
-break;
-case 1:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *))fn)(&global_tid, &bound_tid
-, args[0]);
-break;
-case 2:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1]);
-break;
-case 3:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2]);
-break;
-case 4:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-);
-break;
-case 5:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4]);
-break;
-case 6:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5]);
-break;
-case 7:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6]);
-break;
-case 8:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-);
-break;
-case 9:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8]);
-break;
-case 10:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9]);
-break;
-case 11:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10]);
-break;
-case 12:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-);
-break;
-case 13:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12]);
-break;
-case 14:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13]);
-break;
-case 15:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13], args[14]);
-break;
-case 16:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13], args[14], args[15]
-);
-break;
-case 17:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13], args[14], args[15]
-, args[16]);
-break;
-case 18:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13], args[14], args[15]
-, args[16], args[17]);
-break;
-case 19:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13], args[14], args[15]
-, args[16], args[17], args[18]);
-break;
-case 20:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13], args[14], args[15]
-, args[16], args[17], args[18], args[19]
-);
-break;
-case 21:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13], args[14], args[15]
-, args[16], args[17], args[18], args[19]
-, args[20]);
-break;
-case 22:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13], args[14], args[15]
-, args[16], args[17], args[18], args[19]
-, args[20], args[21]);
-break;
-case 23:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13], args[14], args[15]
-, args[16], args[17], args[18], args[19]
-, args[20], args[21], args[22]);
-break;
-case 24:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13], args[14], args[15]
-, args[16], args[17], args[18], args[19]
-, args[20], args[21], args[22], args[23]
-);
-break;
-case 25:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13], args[14], args[15]
-, args[16], args[17], args[18], args[19]
-, args[20], args[21], args[22], args[23]
-, args[24]);
-break;
-case 26:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13], args[14], args[15]
-, args[16], args[17], args[18], args[19]
-, args[20], args[21], args[22], args[23]
-, args[24], args[25]);
-break;
-case 27:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13], args[14], args[15]
-, args[16], args[17], args[18], args[19]
-, args[20], args[21], args[22], args[23]
-, args[24], args[25], args[26]);
-break;
-case 28:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13], args[14], args[15]
-, args[16], args[17], args[18], args[19]
-, args[20], args[21], args[22], args[23]
-, args[24], args[25], args[26], args[27]
-);
-break;
-case 29:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13], args[14], args[15]
-, args[16], args[17], args[18], args[19]
-, args[20], args[21], args[22], args[23]
-, args[24], args[25], args[26], args[27]
-, args[28]);
-break;
-case 30:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13], args[14], args[15]
-, args[16], args[17], args[18], args[19]
-, args[20], args[21], args[22], args[23]
-, args[24], args[25], args[26], args[27]
-, args[28], args[29]);
-break;
-case 31:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13], args[14], args[15]
-, args[16], args[17], args[18], args[19]
-, args[20], args[21], args[22], args[23]
-, args[24], args[25], args[26], args[27]
-, args[28], args[29], args[30]);
-break;
-case 32:
-((void (*)(kmp_int32 *, kmp_int32 *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-, void *, void *, void *, void *
-))fn)(&global_tid, &bound_tid
-, args[0], args[1], args[2], args[3]
-, args[4], args[5], args[6], args[7]
-, args[8], args[9], args[10], args[11]
-, args[12], args[13], args[14], args[15]
-, args[16], args[17], args[18], args[19]
-, args[20], args[21], args[22], args[23]
-, args[24], args[25], args[26], args[27]
-, args[28], args[29], args[30], args[31]
-);
-break;
\ No newline at end of file
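For context: the file above is generated and is meant to be #include'd inside
a switch over the outlined function's argument count. A hedged sketch of the
shape of such a dispatcher follows; the function and parameter names are
illustrative, not the runtime's actual ones.

  #include <stdint.h>

  typedef int32_t kmp_int32;

  static void invoke_microtask(void *fn, kmp_int32 global_tid,
                               kmp_int32 bound_tid, void **args, int nargs) {
    switch (nargs) {
  #include "generated_microtask_cases.gen"
    default:
      // The generator only covers up to 32 pointer arguments.
      __builtin_trap();
    }
  }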
diff --git a/openmp/libomptarget/deviceRTLs/common/include/target.h b/openmp/libomptarget/deviceRTLs/common/include/target.h
deleted file mode 100644
index fbb72f9900779..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/include/target.h
+++ /dev/null
@@ -1,94 +0,0 @@
-//===-- target.h ---------- OpenMP device runtime target implementation ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Target region interfaces are simple interfaces designed to allow middle-end
-// (=LLVM) passes to analyze and transform the code. To achieve good performance
-// it may be required to run the associated passes. However, implementations of
-// this interface shall always provide a correct implementation as close to the
-// user expected code as possible.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_OPENMP_LIBOMPTARGET_DEVICERTLS_COMMON_TARGET_H
-#define LLVM_OPENMP_LIBOMPTARGET_DEVICERTLS_COMMON_TARGET_H
-
-#include <stdint.h>
-
-extern "C" {
-
-/// Forward declaration of the source location identifier "ident".
-typedef struct ident ident_t;
-
-/// The target region _kernel_ interface for GPUs
-///
-/// This deliberately simple interface provides the middle-end (=LLVM) with
-/// easier means to reason about the semantics of the code and transform it as
-/// well. The runtime calls are therefore also designed to carry sufficient
-/// information necessary for optimizations.
-///
-///
-/// Intended usage:
-///
-/// \code
-/// void kernel(...) {
-/// ThreadKind = __kmpc_target_init(Ident, /* Mode */ 1,
-/// /* UseGenericStateMachine */ true,
-/// /* RequiresFullRuntime */ ... );
-/// if (ThreadKind == -1) {
-/// // User defined kernel code.
-/// }
-/// __kmpc_target_deinit(...);
-/// }
-/// \endcode
-///
-/// Which can be transformed to:
-///
-/// \code
-/// void kernel(...) {
-/// ThreadKind = __kmpc_target_init(Ident, /* Mode */ 1,
-/// /* UseGenericStateMachine */ false,
-/// /* RequiresFullRuntime */ ... );
-/// if (ThreadKind == -1) {
-/// // User defined kernel code.
-/// } else {
-/// assume(ThreadKind == ThreadId);
-/// // Custom, kernel-specific state machine code.
-/// }
-/// __kmpc_target_deinit(...);
-/// }
-/// \endcode
-///
-///
-///{
-
-/// Initialization
-///
-/// Must be called by all threads.
-///
-/// \param Ident Source location identification, can be NULL.
-///
-int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode,
- bool UseGenericStateMachine,
- bool RequiresFullRuntime);
-
-/// De-Initialization
-///
-/// Must be called by the main thread in generic mode, can be called by all
-/// threads. Must be called by all threads in SPMD mode.
-///
-/// In non-SPMD, this function releases the workers trapped in a state machine
-/// and also any memory dynamically allocated by the runtime.
-///
-/// \param Ident Source location identification, can be NULL.
-///
-void __kmpc_target_deinit(ident_t *Ident, int8_t Mode,
- bool RequiresFullRuntime);
-
-///}
-}
-#endif
diff --git a/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h b/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h
deleted file mode 100644
index ed8d97a7480bf..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/include/target/shuffle.h
+++ /dev/null
@@ -1,102 +0,0 @@
-//===- shuffle.h - OpenMP variants of the shuffle idiom for all targets -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Shuffle function implementations for all supported targets.
-//
-// Note: We unify the mask type to uint64_t instead of __kmpc_impl_lanemask_t.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LIBOMPTARGET_DEVICERTL_SHUFFLE_H
-#define LIBOMPTARGET_DEVICERTL_SHUFFLE_H
-
-#include <stdint.h>
-
-#pragma omp declare target
-
-/// External shuffle API
-///
-///{
-
-extern "C" {
-int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
-int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
-}
-
-///}
-
-/// Forward declarations
-///
-///{
-extern "C" {
-unsigned GetLaneId();
-unsigned __kmpc_get_warp_size();
-void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
-uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
-}
-///}
-
-/// Fallback implementations of the shuffle sync idiom.
-/// Unavailable at present (would error at link time if used).
-///
-///{
-
-int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var, int32_t SrcLane);
-
-int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var, uint32_t Delta,
- int32_t Width);
-
-///}
-
-/// AMDGCN implementations of the shuffle sync idiom.
-///
-///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
-
-inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
- int32_t SrcLane) {
- int Width = __kmpc_get_warp_size();
- int Self = GetLaneId();
- int Index = SrcLane + (Self & ~(Width - 1));
- return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
-}
-
-inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
- uint32_t LaneDelta, int32_t Width) {
- int Self = GetLaneId();
- int Index = Self + LaneDelta;
- Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
- return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
-}
-
-#pragma omp end declare variant
-///}
-
-/// NVPTX implementations of the shuffle and shuffle sync idiom.
-///
-///{
-#pragma omp begin declare variant match( \
- device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
-
-inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
- int32_t SrcLane) {
- return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
-}
-
-inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
- uint32_t Delta, int32_t Width) {
- int32_t T = ((__kmpc_get_warp_size() - Width) << 8) | 0x1f;
- return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
-}
-
-#pragma omp end declare variant
-///}
-
-#pragma omp end declare target
-
-#endif
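For context, a minimal sketch (not from this patch) of the warp-level
reduction idiom the shuffle API above exists to support; warp_size and the
use of integer addition are illustrative.

  #include <stdint.h>

  // Assumes the declarations from shuffle.h above are visible.
  static inline int32_t warp_sum(int32_t val, int16_t warp_size) {
    for (int16_t offset = warp_size / 2; offset > 0; offset /= 2)
      val += __kmpc_shuffle_int32(val, offset, warp_size);
    return val;
  }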
diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h
deleted file mode 100644
index 417c22d607d5e..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ /dev/null
@@ -1,282 +0,0 @@
-//===---- omptarget.h - OpenMP GPU initialization ---------------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the declarations of all library macros, types,
-// and functions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_H
-#define OMPTARGET_H
-
-#include "common/allocator.h"
-#include "common/debug.h" // debug
-#include "common/state-queue.h"
-#include "common/support.h"
-#include "interface.h" // interfaces with omp, compiler, and user
-#include "target_impl.h"
-
-#define OMPTARGET_NVPTX_VERSION 1.1
-
-// used by the library for the interface with the app
-#define DISPATCH_FINISHED 0
-#define DISPATCH_NOTFINISHED 1
-
-// used by dynamic scheduling
-#define FINISHED 0
-#define NOT_FINISHED 1
-#define LAST_CHUNK 2
-
-#define BARRIER_COUNTER 0
-#define ORDERED_COUNTER 1
-
-// Worker slot type which is initialized with the default worker slot
-// size of 4*32 bytes.
-struct __kmpc_data_sharing_slot {
- __kmpc_data_sharing_slot *Next;
- __kmpc_data_sharing_slot *Prev;
- void *PrevSlotStackPtr;
- void *DataEnd;
- char Data[DS_Worker_Warp_Slot_Size];
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// task ICV and (implicit & explicit) task state
-
-class omptarget_nvptx_TaskDescr {
-public:
- // methods for flags
- INLINE omp_sched_t GetRuntimeSched() const;
- INLINE void SetRuntimeSched(omp_sched_t sched);
- INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; }
- INLINE int InL2OrHigherParallelRegion() const {
- return items.flags & TaskDescr_InParL2P;
- }
- INLINE int IsParallelConstruct() const {
- return items.flags & TaskDescr_IsParConstr;
- }
- INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
- // methods for other fields
- INLINE uint16_t &ThreadId() { return items.threadId; }
- INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
- INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
- INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
- prev = taskDescr;
- }
- // init & copy
- INLINE void InitLevelZeroTaskDescr();
- INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr);
- INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
- INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
- INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
- INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
- INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr);
- INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
- INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
- uint16_t tid, uint16_t tnum);
- INLINE void SaveLoopData();
- INLINE void RestoreLoopData() const;
-
-private:
- // bits for flags: (6 used, 2 free)
- // 3 bits (SchedMask) for runtime schedule
- // 1 bit (InPar) if this thread has encountered one or more parallel region
- // 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task)
- // 1 bit (InParL2+) if this thread has encountered L2 or higher parallel
- // region
- static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4);
- static const uint8_t TaskDescr_InPar = 0x10;
- static const uint8_t TaskDescr_IsParConstr = 0x20;
- static const uint8_t TaskDescr_InParL2P = 0x40;
-
- struct SavedLoopDescr_items {
- int64_t loopUpperBound;
- int64_t nextLowerBound;
- int64_t chunk;
- int64_t stride;
- kmp_sched_t schedule;
- } loopData;
-
- struct TaskDescr_items {
- uint8_t flags; // 6 bit used (see flag above)
- uint8_t unused;
- uint16_t threadId; // thread id
- uint64_t runtimeChunkSize; // runtime chunk size
- } items;
- omptarget_nvptx_TaskDescr *prev;
-};
-
-// build on kmp
-typedef struct omptarget_nvptx_ExplicitTaskDescr {
- omptarget_nvptx_TaskDescr
- taskDescr; // omptarget_nvptx task description (must be first)
- kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last)
-} omptarget_nvptx_ExplicitTaskDescr;
-
-////////////////////////////////////////////////////////////////////////////////
-// Descriptor of a parallel region (worksharing in general)
-
-class omptarget_nvptx_WorkDescr {
-
-public:
- // access to data
- INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; }
-
-private:
- omptarget_nvptx_TaskDescr masterTaskICV;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-class omptarget_nvptx_TeamDescr {
-public:
- // access to data
- INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
- return &levelZeroTaskDescr;
- }
- INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
- return workDescrForActiveParallel;
- }
-
- // init
- INLINE void InitTeamDescr();
-
- INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
- worker_rootS[wid].DataEnd =
- &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
- // We currently do not have a next slot.
- worker_rootS[wid].Next = 0;
- worker_rootS[wid].Prev = 0;
- worker_rootS[wid].PrevSlotStackPtr = 0;
- return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
- }
-
-private:
- omptarget_nvptx_TaskDescr
- levelZeroTaskDescr; // icv for team master initial thread
- omptarget_nvptx_WorkDescr
- workDescrForActiveParallel; // one, ONLY for the active par
-
- ALIGN(16)
- __kmpc_data_sharing_slot worker_rootS[DS_Max_Warp_Number];
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// thread private data (struct of arrays for better coalescing)
-// tid refers here to the global thread id
-// do not support multiple concurrent kernels at this time
-class omptarget_nvptx_ThreadPrivateContext {
-public:
- // task
- INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) {
- return &levelOneTaskDescr[tid];
- }
- INLINE void SetTopLevelTaskDescr(int tid,
- omptarget_nvptx_TaskDescr *taskICV) {
- topTaskDescr[tid] = taskICV;
- }
- INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const;
- // schedule (for dispatch)
- INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
- INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
- INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
- INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; }
- INLINE int64_t &Stride(int tid) { return stride[tid]; }
-
- INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
-
- INLINE void InitThreadPrivateContext(int tid);
- INLINE uint64_t &Cnt() { return cnt; }
-
-private:
- // team context for this team
- omptarget_nvptx_TeamDescr teamContext;
- // task ICV for implicit threads in the only parallel region
- omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
- // pointer where to find the current task ICV (top of the stack)
- omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
- // schedule (for dispatch)
- kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
- int64_t chunk[MAX_THREADS_PER_TEAM];
- int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
- // state for dispatch with dyn/guided OR static (never use both at a time)
- int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
- int64_t stride[MAX_THREADS_PER_TEAM];
- uint64_t cnt;
-};
-
-/// Memory manager for statically allocated memory.
-class omptarget_nvptx_SimpleMemoryManager {
-private:
- struct MemDataTy {
- volatile unsigned keys[OMP_STATE_COUNT];
- } MemData[MAX_SM] ALIGN(128);
-
- INLINE static uint32_t hash(unsigned key) {
- return key & (OMP_STATE_COUNT - 1);
- }
-
-public:
- INLINE void Release();
- INLINE const void *Acquire(const void *buf, size_t size);
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-// global data tables
-////////////////////////////////////////////////////////////////////////////////
-
-extern omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager;
-extern uint32_t EXTERN_SHARED(usedMemIdx);
-extern uint32_t EXTERN_SHARED(usedSlotIdx);
-#if _OPENMP
-extern uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
-#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
-#else
-extern uint8_t EXTERN_SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE];
-#endif
-extern uint16_t EXTERN_SHARED(threadLimit);
-extern uint16_t EXTERN_SHARED(threadsInTeam);
-extern uint16_t EXTERN_SHARED(nThreads);
-extern omptarget_nvptx_ThreadPrivateContext *
- EXTERN_SHARED(omptarget_nvptx_threadPrivateContext);
-
-extern int8_t EXTERN_SHARED(execution_param);
-extern void *EXTERN_SHARED(ReductionScratchpadPtr);
-
-////////////////////////////////////////////////////////////////////////////////
-// work function (outlined parallel/simd functions) and arguments.
-// needed for L1 parallelism only.
-////////////////////////////////////////////////////////////////////////////////
-
-typedef void *omptarget_nvptx_WorkFn;
-extern omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn);
-
-////////////////////////////////////////////////////////////////////////////////
-// get private data structures
-////////////////////////////////////////////////////////////////////////////////
-
-INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
-INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
-INLINE omptarget_nvptx_TaskDescr *
-getMyTopTaskDescriptor(bool isSPMDExecutionMode);
-INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
-
-////////////////////////////////////////////////////////////////////////////////
-// inlined implementation
-////////////////////////////////////////////////////////////////////////////////
-
-INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); }
-INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); }
-INLINE uint32_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); }
-INLINE uint32_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); }
-
-#include "common/omptargeti.h"
-
-#endif
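For context, a hedged sketch (not from this patch) of how the descriptor
hierarchy above is typically walked; the helper name is made up.

  // Read the OpenMP thread id stored in the current task ICV.
  static inline uint16_t CurrentOmpThreadId(bool IsSPMDExecutionMode) {
    omptarget_nvptx_TaskDescr *TaskDescr =
        getMyTopTaskDescriptor(IsSPMDExecutionMode);
    return TaskDescr->ThreadId();
  }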
diff --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
deleted file mode 100644
index 93831c8952739..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h
+++ /dev/null
@@ -1,223 +0,0 @@
-//===---- omptargeti.h - OpenMP GPU initialization --------------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the declarations of all library macros, types,
-// and functions.
-//
-//===----------------------------------------------------------------------===//
-
-////////////////////////////////////////////////////////////////////////////////
-// Task Descriptor
-////////////////////////////////////////////////////////////////////////////////
-
-INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const {
- // sched starts from 1..4; encode it as 0..3; so add 1 here
- uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1;
- return (omp_sched_t)rc;
-}
-
-INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
- // sched starts from 1..4; encode it as 0..3; so sub 1 here
- uint8_t val = ((uint8_t)sched) - 1;
- // clear current sched
- items.flags &= ~TaskDescr_SchedMask;
- // set new sched
- items.flags |= val;
-}
-
-INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
- // slow method
- // flag:
- // default sched is static,
- // dyn is off (unused now anyway, but may need to sample from host ?)
- // not in parallel
-
- items.flags = 0;
- items.threadId = 0; // is master
-  items.runtimeChunkSize = 1; // preferred chunking static with chunk 1
-}
-
-// This is called when all threads are started together in SPMD mode.
-// OMP directives include target parallel, target distribute parallel for, etc.
-INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
- omptarget_nvptx_TaskDescr *parentTaskDescr) {
- // slow method
- // flag:
- // default sched is static,
- // dyn is off (unused now anyway, but may need to sample from host ?)
- // in L1 parallel
-
- items.flags = TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
- items.threadId =
- __kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only
- // called for 1st level)
-  items.runtimeChunkSize = 1; // preferred chunking static with chunk 1
- prev = parentTaskDescr;
-}
-
-INLINE void omptarget_nvptx_TaskDescr::CopyData(
- omptarget_nvptx_TaskDescr *sourceTaskDescr) {
- items = sourceTaskDescr->items;
-}
-
-INLINE void
-omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) {
- CopyData(sourceTaskDescr);
- prev = sourceTaskDescr->prev;
-}
-
-INLINE void omptarget_nvptx_TaskDescr::CopyParent(
- omptarget_nvptx_TaskDescr *parentTaskDescr) {
- CopyData(parentTaskDescr);
- prev = parentTaskDescr;
-}
-
-INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
- omptarget_nvptx_TaskDescr *parentTaskDescr) {
- CopyParent(parentTaskDescr);
- items.flags = items.flags & ~TaskDescr_IsParConstr;
- ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task");
-}
-
-INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
- omptarget_nvptx_TaskDescr *masterTaskDescr) {
- CopyParent(masterTaskDescr);
- // overwrite specific items;
- items.flags |=
- TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
-}
-
-INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
- omptarget_nvptx_TaskDescr *workTaskDescr) {
- Copy(workTaskDescr);
- //
- // overwrite specific items;
- //
- // The threadID should be __kmpc_get_hardware_thread_id_in_block() %
- // GetMasterThreadID(). This is so that the serial master (first lane in the
- // master warp) gets a threadId of 0. However, we know that this function is
- // always called in a parallel region where only workers are active. The
- // serial master thread never enters this region. When a parallel region is
- // executed serially, the threadId is set to 0 elsewhere and the
- // kmpc_serialized_* functions are called, which never activate this region.
- items.threadId =
- __kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only
- // called for 1st level)
-}
-
-INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
- omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
- CopyParent(parentTaskDescr);
- items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
- items.threadId = tid;
-}
-
-INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() {
- loopData.loopUpperBound =
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId);
- loopData.nextLowerBound =
- omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId);
- loopData.schedule =
- omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId);
- loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId);
- loopData.stride =
- omptarget_nvptx_threadPrivateContext->Stride(items.threadId);
-}
-
-INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const {
- omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk;
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) =
- loopData.loopUpperBound;
- omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) =
- loopData.nextLowerBound;
- omptarget_nvptx_threadPrivateContext->Stride(items.threadId) =
- loopData.stride;
- omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) =
- loopData.schedule;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Thread Private Context
-////////////////////////////////////////////////////////////////////////////////
-
-INLINE omptarget_nvptx_TaskDescr *
-omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const {
- ASSERT0(
- LT_FUSSY, tid < MAX_THREADS_PER_TEAM,
- "Getting top level, tid is larger than allocated data structure size");
- return topTaskDescr[tid];
-}
-
-INLINE void
-omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
- // levelOneTaskDescr is init when starting the parallel region
- // top task descr is NULL (team master version will be fixed separately)
- topTaskDescr[tid] = NULL;
- // the following don't need to be init here; they are init when using dyn
- // sched
- // current_Event, events_Number, chunk, num_Iterations, schedule
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Team Descriptor
-////////////////////////////////////////////////////////////////////////////////
-
-INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
- levelZeroTaskDescr.InitLevelZeroTaskDescr();
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Get private data structure for thread
-////////////////////////////////////////////////////////////////////////////////
-
-// Utility routines for CUDA threads
-INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() {
- return omptarget_nvptx_threadPrivateContext->TeamContext();
-}
-
-INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() {
- omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
- return currTeamDescr.WorkDescr();
-}
-
-INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
- return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
-}
-
-INLINE omptarget_nvptx_TaskDescr *
-getMyTopTaskDescriptor(bool isSPMDExecutionMode) {
- return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Memory management runtime functions.
-////////////////////////////////////////////////////////////////////////////////
-
-INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
- ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
- "SlotIdx is too big or uninitialized.");
- ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
- "MemIdx is too big or uninitialized.");
- MemDataTy &MD = MemData[usedSlotIdx];
- __kmpc_atomic_exchange((unsigned *)&MD.keys[usedMemIdx], 0u);
-}
-
-INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf,
- size_t size) {
- ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
- "SlotIdx is too big or uninitialized.");
- const unsigned sm = usedSlotIdx;
- MemDataTy &MD = MemData[sm];
- unsigned i = hash(GetBlockIdInKernel());
- while (__kmpc_atomic_cas((unsigned *)&MD.keys[i], 0u, 1u) != 0) {
- i = hash(i + 1);
- }
- usedSlotIdx = sm;
- usedMemIdx = i;
- return static_cast<const char *>(buf) + (sm * OMP_STATE_COUNT + i) * size;
-}
diff --git a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu
deleted file mode 100644
index e4df954029dc7..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Interface to be used in the implementation of OpenMP cancel.
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "common/debug.h"
-#include "interface.h"
-
-EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
- int32_t cancelVal) {
- PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal);
- // disabled
- return 0;
-}
-
-EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
- int32_t cancelVal) {
- PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", (int)cancelVal);
- // disabled
- return 0;
-}
-
-#pragma omp end declare target
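For context, a hedged sketch (not from this patch) of the cancellation check
a compiler emits against __kmpc_cancel; since the device implementation above
always returns 0, the early exit is never taken. Names are illustrative.

  void cancellable_body(kmp_Ident *loc, int32_t gtid, int32_t cancel_kind) {
    if (__kmpc_cancel(loc, gtid, cancel_kind))
      return; // cancellation requested; unreachable with this runtime
    // ... remainder of the cancellable construct ...
  }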
diff --git a/openmp/libomptarget/deviceRTLs/common/src/critical.cu b/openmp/libomptarget/deviceRTLs/common/src/critical.cu
deleted file mode 100644
index 48692aa581f23..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/src/critical.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of critical with KMPC interface
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "common/debug.h"
-#include "interface.h"
-
-EXTERN
-void __kmpc_critical(kmp_Ident *loc, int32_t global_tid,
- kmp_CriticalName *lck) {
- PRINT0(LD_IO, "call to kmpc_critical()\n");
- omp_set_lock((omp_lock_t *)lck);
-}
-
-EXTERN
-void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid,
- kmp_CriticalName *lck) {
- PRINT0(LD_IO, "call to kmpc_end_critical()\n");
- omp_unset_lock((omp_lock_t *)lck);
-}
-
-#pragma omp end declare target
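For context, a hedged sketch (not from this patch) of the lowering these two
entry points serve: the body of a critical construct is bracketed by a paired
call. The lock object and the counter update are illustrative.

  void protected_increment(kmp_Ident *loc, int32_t gtid,
                           kmp_CriticalName *lck, int *counter) {
    __kmpc_critical(loc, gtid, lck);
    ++*counter; // body of the critical region
    __kmpc_end_critical(loc, gtid, lck);
  }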
diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
deleted file mode 100644
index f834a7a8e172b..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
+++ /dev/null
@@ -1,194 +0,0 @@
-//===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of data sharing environments
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "common/omptarget.h"
-#include "target/shuffle.h"
-#include "target_impl.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// Runtime functions for trunk data sharing scheme.
-////////////////////////////////////////////////////////////////////////////////
-
-static constexpr unsigned MinBytes = 8;
-
-static constexpr unsigned Alignment = 8;
-
-/// External symbol to access dynamic shared memory.
-extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
-#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
-
-EXTERN void *__kmpc_get_dynamic_shared() { return DynamicSharedBuffer; }
-
-EXTERN void *llvm_omp_get_dynamic_shared() {
- return __kmpc_get_dynamic_shared();
-}
-
-template <unsigned BPerThread, unsigned NThreads = MAX_THREADS_PER_TEAM>
-struct alignas(32) ThreadStackTy {
- static constexpr unsigned BytesPerThread = BPerThread;
- static constexpr unsigned NumThreads = NThreads;
- static constexpr unsigned NumWarps = (NThreads + WARPSIZE - 1) / WARPSIZE;
-
- unsigned char Data[NumThreads][BytesPerThread];
- unsigned char Usage[NumThreads];
-};
-
-[[clang::loader_uninitialized]] ThreadStackTy<MinBytes * 8, 1> MainSharedStack;
-#pragma omp allocate(MainSharedStack) allocator(omp_pteam_mem_alloc)
-
-[[clang::loader_uninitialized]] ThreadStackTy<MinBytes,
- MAX_THREADS_PER_TEAM / 4>
- WorkerSharedStack;
-#pragma omp allocate(WorkerSharedStack) allocator(omp_pteam_mem_alloc)
-
-EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
- size_t AlignedBytes = Bytes + (Bytes % MinBytes);
- int TID = __kmpc_get_hardware_thread_id_in_block();
- if (__kmpc_is_generic_main_thread(TID)) {
- // Main thread alone, use shared memory if space is available.
- if (MainSharedStack.Usage[0] + AlignedBytes <= MainSharedStack.BytesPerThread) {
- void *Ptr = &MainSharedStack.Data[0][MainSharedStack.Usage[0]];
- MainSharedStack.Usage[0] += AlignedBytes;
- return Ptr;
- }
- } else if (TID < WorkerSharedStack.NumThreads) {
- if (WorkerSharedStack.Usage[TID] + AlignedBytes <= WorkerSharedStack.BytesPerThread) {
- void *Ptr = &WorkerSharedStack.Data[TID][WorkerSharedStack.Usage[TID]];
- WorkerSharedStack.Usage[TID] += AlignedBytes;
- return Ptr;
- }
- }
- // Fallback to malloc
- return SafeMalloc(Bytes, "AllocGlobalFallback");
-}
-
-EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes) {
- size_t AlignedBytes = Bytes + (Bytes % MinBytes);
- int TID = __kmpc_get_hardware_thread_id_in_block();
- if (__kmpc_is_generic_main_thread(TID)) {
- if (Ptr >= &MainSharedStack.Data[0][0] &&
- Ptr < &MainSharedStack.Data[MainSharedStack.NumThreads][0]) {
- MainSharedStack.Usage[0] -= AlignedBytes;
- return;
- }
- } else if (TID < WorkerSharedStack.NumThreads) {
- if (Ptr >= &WorkerSharedStack.Data[0][0] &&
- Ptr < &WorkerSharedStack.Data[WorkerSharedStack.NumThreads][0]) {
- int TID = __kmpc_get_hardware_thread_id_in_block();
- WorkerSharedStack.Usage[TID] -= AlignedBytes;
- return;
- }
- }
- SafeFree(Ptr, "FreeGlobalFallback");
-}
-
-EXTERN void __kmpc_data_sharing_init_stack() {
- for (unsigned i = 0; i < MainSharedStack.NumWarps; ++i)
- MainSharedStack.Usage[i] = 0;
- for (unsigned i = 0; i < WorkerSharedStack.NumThreads; ++i)
- WorkerSharedStack.Usage[i] = 0;
-}
-
-/// Allocate storage in shared memory to communicate arguments from the main
-/// thread to the workers in generic mode. If we exceed
-/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
-#define NUM_SHARED_VARIABLES_IN_SHARED_MEM 64
-
-[[clang::loader_uninitialized]] static void
- *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
-#pragma omp allocate(SharedMemVariableSharingSpace) \
- allocator(omp_pteam_mem_alloc)
-[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
-#pragma omp allocate(SharedMemVariableSharingSpacePtr) \
- allocator(omp_pteam_mem_alloc)
-
-// Begin a data sharing context. Maintain a list of references to shared
-// variables. This list of references to shared variables will be passed
-// to one or more threads.
-// In L0 data sharing this is called by master thread.
-// In L1 data sharing this is called by active warp master thread.
-EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
- if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
- SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
- } else {
- SharedMemVariableSharingSpacePtr =
- (void **)SafeMalloc(nArgs * sizeof(void *), "new extended args");
- }
- *GlobalArgs = SharedMemVariableSharingSpacePtr;
-}
-
-// End a data sharing context. There is no need to have a list of refs
-// to shared variables because the context in which those variables were
-// shared has now ended. This should clean-up the list of references only
-// without affecting the actual global storage of the variables.
-// In L0 data sharing this is called by master thread.
-// In L1 data sharing this is called by active warp master thread.
-EXTERN void __kmpc_end_sharing_variables() {
- if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
- SafeFree(SharedMemVariableSharingSpacePtr, "new extended args");
-}
-
-// This function will return a list of references to global variables. This
-// is how the workers will get a reference to the globalized variable. The
-// members of this list will be passed to the outlined parallel function
-// preserving the order.
-// Called by all workers.
-EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
- *GlobalArgs = SharedMemVariableSharingSpacePtr;
-}
-
-// This function is used to init static memory manager. This manager is used to
-// manage statically allocated global memory. This memory is allocated by the
-// compiler and used to correctly implement globalization of the variables in
-// target, teams and distribute regions.
-EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
- const void *buf, size_t size,
- int16_t is_shared,
- const void **frame) {
- if (is_shared) {
- *frame = buf;
- return;
- }
- if (isSPMDExecutionMode) {
- if (__kmpc_get_hardware_thread_id_in_block() == 0) {
- *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
- }
- __kmpc_impl_syncthreads();
- return;
- }
- ASSERT0(LT_FUSSY,
- __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
- "Must be called only in the target master thread.");
- *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
- __kmpc_impl_threadfence();
-}
-
-EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
- int16_t is_shared) {
- if (is_shared)
- return;
- if (isSPMDExecutionMode) {
- __kmpc_impl_syncthreads();
- if (__kmpc_get_hardware_thread_id_in_block() == 0) {
- omptarget_nvptx_simpleMemoryManager.Release();
- }
- return;
- }
- __kmpc_impl_threadfence();
- ASSERT0(LT_FUSSY,
- __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
- "Must be called only in the target master thread.");
- omptarget_nvptx_simpleMemoryManager.Release();
-}
-
-#pragma omp end declare target
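For context, a hedged sketch (not from this patch) of how the compiler pairs
the two entry points above to globalize a stack variable that escapes into a
parallel region; the variable and its size are illustrative.

  void globalized_local_sketch() {
    int *x = (int *)__kmpc_alloc_shared(sizeof(int));
    *x = 0;
    // ... an outlined parallel region captures and updates *x here ...
    __kmpc_free_shared(x, sizeof(int));
  }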
diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
deleted file mode 100644
index f1511298a99be..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
+++ /dev/null
@@ -1,359 +0,0 @@
-//===------------ libcall.cu - OpenMP GPU user calls ------------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the OpenMP runtime functions that can be
-// invoked by the user in an OpenMP region
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "common/omptarget.h"
-#include "target_impl.h"
-
-EXTERN double omp_get_wtick(void) {
- double rc = __kmpc_impl_get_wtick();
- PRINT(LD_IO, "omp_get_wtick() returns %g\n", rc);
- return rc;
-}
-
-EXTERN double omp_get_wtime(void) {
- double rc = __kmpc_impl_get_wtime();
- PRINT(LD_IO, "call omp_get_wtime() returns %g\n", rc);
- return rc;
-}
-
-EXTERN void omp_set_num_threads(int num) {
- // Ignore it for SPMD mode.
- if (__kmpc_is_spmd_exec_mode())
- return;
- ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
- PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num);
- if (num <= 0) {
- WARNING0(LW_INPUT, "expected positive num; ignore\n");
- } else if (parallelLevel[GetWarpId()] == 0) {
- nThreads = num;
- }
-}
-
-EXTERN int omp_get_num_threads(void) {
- int rc = GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
- PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc);
- return rc;
-}
-
-EXTERN int omp_get_max_threads(void) {
- if (parallelLevel[GetWarpId()] > 0)
- // We're already in parallel region.
- return 1; // default is 1 thread avail
- // Not currently in a parallel region, return what was set.
- int rc = 1;
- if (parallelLevel[GetWarpId()] == 0)
- rc = nThreads;
- ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads");
- PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc);
- return rc;
-}
-
-EXTERN int omp_get_thread_limit(void) {
- if (__kmpc_is_spmd_exec_mode())
- return __kmpc_get_hardware_num_threads_in_block();
- int rc = threadLimit;
- PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc);
- return rc;
-}
-
-EXTERN int omp_get_thread_num() {
- int rc = GetOmpThreadId();
- PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
- return rc;
-}
-
-EXTERN int omp_get_num_procs(void) {
- int rc = GetNumberOfProcsInDevice(__kmpc_is_spmd_exec_mode());
- PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc);
- return rc;
-}
-
-EXTERN int omp_in_parallel(void) {
- int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
- PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc);
- return rc;
-}
-
-EXTERN int omp_in_final(void) {
- // Treat all tasks as final. The spec may expect the runtime to keep track
- // more precisely of whether a task was explicitly marked final by the
- // user. This is not explicitly specified; we act as if the runtime may
- // actively decide to turn a non-final task into a final one.
- int rc = 1;
- PRINT(LD_IO, "call omp_in_final() returns %d\n", rc);
- return rc;
-}
-
-EXTERN void omp_set_dynamic(int flag) {
- PRINT(LD_IO, "call omp_set_dynamic(%d) is ignored (no support)\n", flag);
-}
-
-EXTERN int omp_get_dynamic(void) {
- int rc = 0;
- PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", rc);
- return rc;
-}
-
-EXTERN void omp_set_nested(int flag) {
- PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n",
- flag);
-}
-
-EXTERN int omp_get_nested(void) {
- int rc = 0;
- PRINT(LD_IO, "call omp_get_nested() returns %d\n", rc);
- return rc;
-}
-
-EXTERN void omp_set_max_active_levels(int level) {
- PRINT(LD_IO,
- "call omp_set_max_active_levels(%d) is ignored (no nested support)\n",
- level);
-}
-
-EXTERN int omp_get_max_active_levels(void) {
- int rc = 1;
- PRINT(LD_IO, "call omp_get_max_active_levels() returns %d\n", rc);
- return rc;
-}
-
-EXTERN int omp_get_level(void) {
- int level = __kmpc_parallel_level();
- PRINT(LD_IO, "call omp_get_level() returns %d\n", level);
- return level;
-}
-
-EXTERN int omp_get_active_level(void) {
- int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
- PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level)
- return level;
-}
-
-EXTERN int omp_get_ancestor_thread_num(int level) {
- if (__kmpc_is_spmd_exec_mode())
- return level == 1 ? __kmpc_get_hardware_thread_id_in_block() : 0;
- int rc = -1;
- // If level is 0 or all parallel regions are not active - return 0.
- unsigned parLevel = parallelLevel[GetWarpId()];
- if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) {
- int totLevel = omp_get_level();
- if (level <= totLevel) {
- omptarget_nvptx_TaskDescr *currTaskDescr =
- getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
- int steps = totLevel - level;
- PRINT(LD_IO, "backtrack %d steps\n", steps);
- ASSERT0(LT_FUSSY, currTaskDescr,
- "do not expect fct to be called in a non-active thread");
- do {
- if (DON(LD_IOD)) {
- // print current state
- omp_sched_t sched = currTaskDescr->GetRuntimeSched();
- PRINT(LD_ALL,
- "task descr %s %d: %s, in par %d, rt sched %d,"
- " chunk %" PRIu64 "; tid %d, tnum %d, nthreads %d\n",
- "ancestor", steps,
- (currTaskDescr->IsParallelConstruct() ? "par" : "task"),
- (int)currTaskDescr->InParallelRegion(), (int)sched,
- currTaskDescr->RuntimeChunkSize(),
- (int)currTaskDescr->ThreadId(), (int)threadsInTeam,
- (int)nThreads);
- }
-
- if (currTaskDescr->IsParallelConstruct()) {
- // found the level
- if (!steps) {
- rc = currTaskDescr->ThreadId();
- break;
- }
- steps--;
- }
- currTaskDescr = currTaskDescr->GetPrevTaskDescr();
- } while (currTaskDescr);
- ASSERT0(LT_FUSSY, !steps, "expected to find all steps");
- }
- } else if (level == 0 ||
- (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
- level <= parLevel) ||
- (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL &&
- level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) {
- rc = 0;
- }
- PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level,
- rc)
- return rc;
-}
-
-EXTERN int omp_get_team_size(int level) {
- if (__kmpc_is_spmd_exec_mode())
- return level == 1 ? __kmpc_get_hardware_num_threads_in_block() : 1;
- int rc = -1;
- unsigned parLevel = parallelLevel[GetWarpId()];
- // If level is 0 or all parallel regions are not active - return 1.
- if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) {
- rc = threadsInTeam;
- } else if (level == 0 ||
- (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
- level <= parLevel) ||
- (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL &&
- level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) {
- rc = 1;
- }
- PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc)
- return rc;
-}
-
-EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) {
- if (isRuntimeUninitialized()) {
- ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
- "Expected SPMD mode only with uninitialized runtime.");
- *kind = omp_sched_static;
- *modifier = 1;
- } else {
- omptarget_nvptx_TaskDescr *currTaskDescr =
- getMyTopTaskDescriptor(__kmpc_is_spmd_exec_mode());
- *kind = currTaskDescr->GetRuntimeSched();
- *modifier = currTaskDescr->RuntimeChunkSize();
- }
- PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n",
- (int)*kind, *modifier);
-}
-
-EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) {
- PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind,
- modifier);
- if (isRuntimeUninitialized()) {
- ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
- "Expected SPMD mode only with uninitialized runtime.");
- return;
- }
- if (kind >= omp_sched_static && kind < omp_sched_auto) {
- omptarget_nvptx_TaskDescr *currTaskDescr =
- getMyTopTaskDescriptor(__kmpc_is_spmd_exec_mode());
- currTaskDescr->SetRuntimeSched(kind);
- currTaskDescr->RuntimeChunkSize() = modifier;
- PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %" PRIu64 "\n",
- (int)currTaskDescr->GetRuntimeSched(),
- currTaskDescr->RuntimeChunkSize());
- }
-}
-
-EXTERN omp_proc_bind_t omp_get_proc_bind(void) {
- PRINT0(LD_IO, "call omp_get_proc_bin() is true, regardless on state\n");
- return omp_proc_bind_true;
-}
-
-EXTERN int omp_get_num_places(void) {
- PRINT0(LD_IO, "call omp_get_num_places() returns 0\n");
- return 0;
-}
-
-EXTERN int omp_get_place_num_procs(int place_num) {
- PRINT0(LD_IO, "call omp_get_place_num_procs() returns 0\n");
- return 0;
-}
-
-EXTERN void omp_get_place_proc_ids(int place_num, int *ids) {
- PRINT0(LD_IO, "call to omp_get_place_proc_ids()\n");
-}
-
-EXTERN int omp_get_place_num(void) {
- PRINT0(LD_IO, "call to omp_get_place_num() returns 0\n");
- return 0;
-}
-
-EXTERN int omp_get_partition_num_places(void) {
- PRINT0(LD_IO, "call to omp_get_partition_num_places() returns 0\n");
- return 0;
-}
-
-EXTERN void omp_get_partition_place_nums(int *place_nums) {
- PRINT0(LD_IO, "call to omp_get_partition_place_nums()\n");
-}
-
-EXTERN int omp_get_cancellation(void) {
- int rc = 0;
- PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", rc);
- return rc;
-}
-
-EXTERN void omp_set_default_device(int deviceId) {
- PRINT0(LD_IO, "call omp_get_default_device() is undef on device\n");
-}
-
-EXTERN int omp_get_default_device(void) {
- PRINT0(LD_IO,
- "call omp_get_default_device() is undef on device, returns 0\n");
- return 0;
-}
-
-EXTERN int omp_get_num_devices(void) {
- PRINT0(LD_IO, "call omp_get_num_devices() is undef on device, returns 0\n");
- return 0;
-}
-
-EXTERN int omp_get_num_teams(void) {
- int rc = GetNumberOfOmpTeams();
- PRINT(LD_IO, "call omp_get_num_teams() returns %d\n", rc);
- return rc;
-}
-
-EXTERN int omp_get_team_num() {
- int rc = GetOmpTeamId();
- PRINT(LD_IO, "call omp_get_team_num() returns %d\n", rc);
- return rc;
-}
-
-// Unspecified on the device.
-EXTERN int omp_get_initial_device(void) {
- PRINT0(LD_IO, "call omp_get_initial_device() returns 0\n");
- return 0;
-}
-
-// Unused for now.
-EXTERN int omp_get_max_task_priority(void) {
- PRINT0(LD_IO, "call omp_get_max_task_priority() returns 0\n");
- return 0;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// locks
-////////////////////////////////////////////////////////////////////////////////
-
-EXTERN void omp_init_lock(omp_lock_t *lock) {
- __kmpc_impl_init_lock(lock);
- PRINT0(LD_IO, "call omp_init_lock()\n");
-}
-
-EXTERN void omp_destroy_lock(omp_lock_t *lock) {
- __kmpc_impl_destroy_lock(lock);
- PRINT0(LD_IO, "call omp_destroy_lock()\n");
-}
-
-EXTERN void omp_set_lock(omp_lock_t *lock) {
- __kmpc_impl_set_lock(lock);
- PRINT0(LD_IO, "call omp_set_lock()\n");
-}
-
-EXTERN void omp_unset_lock(omp_lock_t *lock) {
- __kmpc_impl_unset_lock(lock);
- PRINT0(LD_IO, "call omp_unset_lock()\n");
-}
-
-EXTERN int omp_test_lock(omp_lock_t *lock) {
- int rc = __kmpc_impl_test_lock(lock);
- PRINT(LD_IO, "call omp_test_lock() return %d\n", rc);
- return rc;
-}
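
For reference, a small user-level program that exercises several of the entry points above through the standard OpenMP API; this is plain application code, not part of the runtime, and the thread count is whatever the device chooses:

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
      int sum = 0;
    #pragma omp target map(tofrom : sum)
      {
        omp_lock_t lock;                // device-side lock, see omp_init_lock above
        omp_init_lock(&lock);
    #pragma omp parallel
        {
          omp_set_lock(&lock);          // serializes the update below
          sum += 1;
          omp_unset_lock(&lock);
        }
        omp_destroy_lock(&lock);
      }
      printf("threads that ran: %d\n", sum);
      return 0;
    }
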
-
-#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
deleted file mode 100644
index 0d4e2b1e4e0a4..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu
+++ /dev/null
@@ -1,813 +0,0 @@
-//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of the KMPC interface
-// for the loop construct plus other worksharing constructs that use the same
-// interface as loops.
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "common/omptarget.h"
-#include "target/shuffle.h"
-#include "target_impl.h"
-
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-// template class that encapsulates all the helper functions
-//
-// T is loop iteration type (32 | 64) (unsigned | signed)
-// ST is the signed version of T
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-
-template <typename T, typename ST> class omptarget_nvptx_LoopSupport {
-public:
- ////////////////////////////////////////////////////////////////////////////////
- // Loop with static scheduling with chunk
-
- // Generic implementation of OMP loop scheduling with static policy
- /*! \brief Calculate the initial bounds and stride for a static loop
- * @param[in] loc location in code of the call (not used here)
- * @param[in] global_tid global thread id
- * @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
- * @param[in] plastiter pointer to the last-iteration flag
- * @param[in,out] plower pointer to the loop lower bound; on return it holds
- * the lower bound of the first chunk
- * @param[in,out] pupper pointer to the loop upper bound; on return it holds
- * the upper bound of the first chunk
- * @param[in,out] pstride pointer to the loop stride; on return it holds the
- * stride between two successive chunks executed by the same thread
- * @param[in] incr loop increment bump
- * @param[in] chunk chunk size
- */
-
- // helper function for static chunk
- INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride,
- ST chunk, T entityId, T numberOfEntities) {
- // each thread executes multiple chunks all of the same size, except
- // the last one
-
- // distance between two successive chunks
- stride = numberOfEntities * chunk;
- lb = lb + entityId * chunk;
- T inputUb = ub;
- ub = lb + chunk - 1; // Clang uses i <= ub
- // Say ub' is the beginning of the last chunk. Then whoever has a
- // lower bound plus a multiple of the increment equal to ub' is
- // the last one.
- T beginingLastChunk = inputUb - (inputUb % chunk);
- last = ((beginingLastChunk - lb) % stride) == 0;
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- // Loop with static scheduling without chunk
-
- // helper function for static no chunk
- INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride,
- ST &chunk, T entityId,
- T numberOfEntities) {
- // No chunk size specified. Each thread or warp gets at most one
- // chunk; chunks are all of almost equal size.
- T loopSize = ub - lb + 1;
-
- chunk = loopSize / numberOfEntities;
- T leftOver = loopSize - chunk * numberOfEntities;
-
- if (entityId < leftOver) {
- chunk++;
- lb = lb + entityId * chunk;
- } else {
- lb = lb + entityId * chunk + leftOver;
- }
-
- T inputUb = ub;
- ub = lb + chunk - 1; // Clang uses i <= ub
- last = lb <= inputUb && inputUb <= ub;
- stride = loopSize; // make sure we only do 1 chunk per warp
- }
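
A minimal host-side sketch that reproduces the arithmetic of ForStaticChunk above for a concrete configuration (100 iterations, 8 threads, chunk size 4 are assumed values chosen for illustration):

    #include <cstdio>

    int main() {
      const long chunk = 4, nthreads = 8, lb0 = 0, ub0 = 99;
      for (long tid = 0; tid < nthreads; ++tid) {
        long stride = nthreads * chunk;        // distance between two chunks
        long lb = lb0 + tid * chunk;           // first chunk's lower bound
        long ub = lb + chunk - 1;              // Clang uses i <= ub
        long beginLast = ub0 - (ub0 % chunk);  // start of the last chunk (96)
        int last = ((beginLast - lb) % stride) == 0;
        printf("tid %ld: first chunk [%ld,%ld], stride %ld, last=%d\n", tid, lb,
               ub, stride, last);
      }
      return 0;
    }

In this configuration thread 0 owns the final chunk [96,99], so it is the one flagged with last = 1.
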
-
- ////////////////////////////////////////////////////////////////////////////////
- // Support for Static Init
-
- INLINE static void for_static_init(int32_t gtid, int32_t schedtype,
- int32_t *plastiter, T *plower, T *pupper,
- ST *pstride, ST chunk,
- bool IsSPMDExecutionMode) {
- // When IsRuntimeUninitialized is true, we assume that the caller is
- // in an L0 parallel region and that all worker threads participate.
-
- // Assume we are in teams region or that we use a single block
- // per target region
- ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode);
-
- // All warps in excess of the maximum requested do not execute
- // the loop.
- PRINT(LD_LOOP,
- "OMP Thread %d: schedule type %d, chunk size = %lld, mytid "
- "%d, num tids %d\n",
- (int)gtid, (int)schedtype, (long long)chunk, (int)gtid,
- (int)numberOfActiveOMPThreads);
- ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
- "current thread is not needed here; error");
-
- // copy
- int lastiter = 0;
- T lb = *plower;
- T ub = *pupper;
- ST stride = *pstride;
- // init
- switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
- case kmp_sched_static_chunk: {
- if (chunk > 0) {
- ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
- numberOfActiveOMPThreads);
- break;
- }
- } // note: if chunk <=0, use nochunk
- case kmp_sched_static_balanced_chunk: {
- if (chunk > 0) {
- // round up to make sure the chunk is enough to cover all iterations
- T tripCount = ub - lb + 1; // +1 because ub is inclusive
- T span = (tripCount + numberOfActiveOMPThreads - 1) /
- numberOfActiveOMPThreads;
- // perform chunk adjustment
- chunk = (span + chunk - 1) & ~(chunk - 1);
-
- ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
- T oldUb = ub;
- ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
- numberOfActiveOMPThreads);
- if (ub > oldUb)
- ub = oldUb;
- break;
- }
- } // note: if chunk <=0, use nochunk
- case kmp_sched_static_nochunk: {
- ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
- numberOfActiveOMPThreads);
- break;
- }
- case kmp_sched_distr_static_chunk: {
- if (chunk > 0) {
- ForStaticChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
- GetNumberOfOmpTeams());
- break;
- } // note: if chunk <=0, use nochunk
- }
- case kmp_sched_distr_static_nochunk: {
- ForStaticNoChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
- GetNumberOfOmpTeams());
- break;
- }
- case kmp_sched_distr_static_chunk_sched_static_chunkone: {
- ForStaticChunk(lastiter, lb, ub, stride, chunk,
- numberOfActiveOMPThreads * GetOmpTeamId() + gtid,
- GetNumberOfOmpTeams() * numberOfActiveOMPThreads);
- break;
- }
- default: {
- ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
- PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n",
- (int)schedtype);
- ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
- numberOfActiveOMPThreads);
- break;
- }
- }
- // copy back
- *plastiter = lastiter;
- *plower = lb;
- *pupper = ub;
- *pstride = stride;
- PRINT(LD_LOOP,
- "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last "
- "%d\n",
- (int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(),
- (long long)(*plower), (long long)(*pupper), (long long)(*pstride),
- (int)lastiter);
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- // Support for dispatch Init
-
- INLINE static int OrderedSchedule(kmp_sched_t schedule) {
- return schedule >= kmp_sched_ordered_first &&
- schedule <= kmp_sched_ordered_last;
- }
-
- INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId,
- kmp_sched_t schedule, T lb, T ub, ST st,
- ST chunk) {
- if (isRuntimeUninitialized()) {
- // In SPMD mode no need to check parallelism level - dynamic scheduling
- // may appear only in L2 parallel regions with lightweight runtime.
- ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), "Expected non-SPMD mode.");
- return;
- }
- int tid = GetLogicalThreadIdInBlock();
- omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
- T tnum = GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
- T tripCount = ub - lb + 1; // +1 because ub is inclusive
- ASSERT0(LT_FUSSY, threadId < tnum,
- "current thread is not needed here; error");
-
- /* Currently just ignore the monotonic and non-monotonic modifiers
- * (the compiler isn't producing them yet anyway).
- * When it is we'll want to look at them somewhere here and use that
- * information to add to our schedule choice. We shouldn't need to pass
- * them on, they merely affect which schedule we can legally choose for
- * various dynamic cases. (In particular, whether or not a stealing scheme
- * is legal).
- */
- schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
-
- // Process schedule.
- if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
- if (OrderedSchedule(schedule))
- __kmpc_barrier(loc, threadId);
- PRINT(LD_LOOP,
- "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
- (long)tnum, (long long)tripCount, (int)schedule);
- schedule = kmp_sched_static_chunk;
- chunk = tripCount; // one thread gets the whole loop
- } else if (schedule == kmp_sched_runtime) {
- // process runtime
- omp_sched_t rtSched = currTaskDescr->GetRuntimeSched();
- chunk = currTaskDescr->RuntimeChunkSize();
- switch (rtSched) {
- case omp_sched_static: {
- if (chunk > 0)
- schedule = kmp_sched_static_chunk;
- else
- schedule = kmp_sched_static_nochunk;
- break;
- }
- case omp_sched_auto: {
- schedule = kmp_sched_static_chunk;
- chunk = 1;
- break;
- }
- case omp_sched_dynamic:
- case omp_sched_guided: {
- schedule = kmp_sched_dynamic;
- break;
- }
- }
- PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule,
- (long long)chunk);
- } else if (schedule == kmp_sched_auto) {
- schedule = kmp_sched_static_chunk;
- chunk = 1;
- PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule,
- (long long)chunk);
- } else {
- PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule,
- (long long)chunk);
- ASSERT(LT_FUSSY,
- schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
- "unknown schedule %d & chunk %lld\n", (int)schedule,
- (long long)chunk);
- }
-
- // init schedules
- if (schedule == kmp_sched_static_chunk) {
- ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
- // save sched state
- omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
- // save ub
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
- // compute static chunk
- ST stride;
- int lastiter = 0;
- ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
- // save computed params
- omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
- omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
- omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
- PRINT(LD_LOOP,
- "dispatch init (static chunk) : num threads = %d, ub = %" PRId64
- ", next lower bound = %llu, stride = %llu\n",
- (int)tnum,
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
- (unsigned long long)
- omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
- (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
- tid));
- } else if (schedule == kmp_sched_static_balanced_chunk) {
- ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
- // save sched state
- omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
- // save ub
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
- // compute static chunk
- ST stride;
- int lastiter = 0;
- // round up to make sure the chunk is enough to cover all iterations
- T span = (tripCount + tnum - 1) / tnum;
- // perform chunk adjustment
- chunk = (span + chunk - 1) & ~(chunk - 1);
-
- T oldUb = ub;
- ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
- ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
- if (ub > oldUb)
- ub = oldUb;
- // save computed params
- omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
- omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
- omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
- PRINT(LD_LOOP,
- "dispatch init (static chunk) : num threads = %d, ub = %" PRId64
- ", next lower bound = %llu, stride = %llu\n",
- (int)tnum,
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
- (unsigned long long)
- omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
- (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
- tid));
- } else if (schedule == kmp_sched_static_nochunk) {
- ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
- // save sched state
- omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
- // save ub
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
- // compute static chunk
- ST stride;
- int lastiter = 0;
- ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
- // save computed params
- omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
- omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
- omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
- PRINT(LD_LOOP,
- "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
- ", next lower bound = %llu, stride = %llu\n",
- (int)tnum,
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
- (unsigned long long)
- omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
- (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
- tid));
- } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
- // save data
- omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
- if (chunk < 1)
- chunk = 1;
- omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
- omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
- __kmpc_barrier(loc, threadId);
- if (tid == 0) {
- omptarget_nvptx_threadPrivateContext->Cnt() = 0;
- __kmpc_impl_threadfence_block();
- }
- __kmpc_barrier(loc, threadId);
- PRINT(LD_LOOP,
- "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
- ", chunk %" PRIu64 "\n",
- (int)tnum,
- (unsigned long long)
- omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
- omptarget_nvptx_threadPrivateContext->Chunk(tid));
- }
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- // Support for dispatch next
-
- INLINE static uint64_t Shuffle(__kmpc_impl_lanemask_t active, int64_t val,
- int leader) {
- uint32_t lo, hi;
- __kmpc_impl_unpack(val, lo, hi);
- hi = __kmpc_impl_shfl_sync(active, hi, leader);
- lo = __kmpc_impl_shfl_sync(active, lo, leader);
- return __kmpc_impl_pack(lo, hi);
- }
-
- INLINE static uint64_t NextIter() {
- __kmpc_impl_lanemask_t active = __kmpc_impl_activemask();
- uint32_t leader = __kmpc_impl_ffs(active) - 1;
- uint32_t change = __kmpc_impl_popc(active);
- __kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt();
- unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt);
- uint64_t warp_res;
- if (rank == 0) {
- warp_res = __kmpc_atomic_add(
- (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(),
- (unsigned long long)change);
- }
- warp_res = Shuffle(active, warp_res, leader);
- return warp_res + rank;
- }
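
NextIter above is a warp-aggregated atomic increment: one lane performs a single atomic add on behalf of every active lane and broadcasts the old value, so each lane receives a unique iteration index at the cost of one atomic per warp instead of one per lane. A hedged CUDA sketch of the same idea using the raw intrinsics instead of the __kmpc_impl_* wrappers (the counter pointer is a placeholder):

    __device__ unsigned long long warp_aggregated_add(unsigned long long *ctr) {
      unsigned mask = __activemask();               // lanes active at this point
      int leader = __ffs(mask) - 1;                 // lowest active lane
      int lane = threadIdx.x & 31;
      unsigned rank = __popc(mask & ((1u << lane) - 1)); // my rank among active lanes
      unsigned long long base;
      if (lane == leader)
        base = atomicAdd(ctr, (unsigned long long)__popc(mask));
      base = __shfl_sync(mask, base, leader);       // broadcast old counter value
      return base + rank;                           // unique slot per lane
    }
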
-
- INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
- T loopLowerBound, T loopUpperBound) {
- T N = NextIter();
- lb = loopLowerBound + N * chunkSize;
- ub = lb + chunkSize - 1; // Clang uses i <= ub
-
- // 3 result cases:
- // a. lb and ub < loopUpperBound --> NOT_FINISHED
- // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
- // NOT_FINISHED
- // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
- // a.
- if (lb <= loopUpperBound && ub < loopUpperBound) {
- PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n",
- (long long)lb, (long long)ub, (long long)loopUpperBound);
- return NOT_FINISHED;
- }
- // b.
- if (lb <= loopUpperBound) {
- PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n",
- (long long)lb, (long long)ub, (long long)loopUpperBound);
- ub = loopUpperBound;
- return LAST_CHUNK;
- }
- // c. if we are here, we are in case 'c'
- lb = loopUpperBound + 2;
- ub = loopUpperBound + 1;
- PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb,
- (long long)ub, (long long)loopUpperBound);
- return FINISHED;
- }
-
- INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast,
- T *plower, T *pupper, ST *pstride) {
- if (isRuntimeUninitialized()) {
- // In SPMD mode no need to check parallelism level - dynamic scheduling
- // may appear only in L2 parallel regions with lightweight runtime.
- ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), "Expected non-SPMD mode.");
- if (*plast)
- return DISPATCH_FINISHED;
- *plast = 1;
- return DISPATCH_NOTFINISHED;
- }
- // ID of a thread in its own warp
-
- // automatically selects thread or warp ID based on selected implementation
- int tid = GetLogicalThreadIdInBlock();
- ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()),
- "current thread is not needed here; error");
- // retrieve schedule
- kmp_sched_t schedule =
- omptarget_nvptx_threadPrivateContext->ScheduleType(tid);
-
- // xxx reduce to one
- if (schedule == kmp_sched_static_chunk ||
- schedule == kmp_sched_static_nochunk) {
- T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid);
- T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid);
- // finished?
- if (myLb > ub) {
- PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n",
- (long long)myLb, (long long)ub);
- return DISPATCH_FINISHED;
- }
- // not finished, save current bounds
- ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid);
- *plower = myLb;
- T myUb = myLb + chunk - 1; // Clang uses i <= ub
- if (myUb > ub)
- myUb = ub;
- *pupper = myUb;
- *plast = (int32_t)(myUb == ub);
-
- // increment next lower bound by the stride
- ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid);
- omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride;
- PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n",
- (long long)*plower, (long long)*pupper);
- return DISPATCH_NOTFINISHED;
- }
- ASSERT0(LT_FUSSY,
- schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
- "bad sched");
- T myLb, myUb;
- int finished = DynamicNextChunk(
- myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid),
- omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
- omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
-
- if (finished == FINISHED)
- return DISPATCH_FINISHED;
-
- // not finished (either not finished or last chunk)
- *plast = (int32_t)(finished == LAST_CHUNK);
- *plower = myLb;
- *pupper = myUb;
- *pstride = 1;
-
- PRINT(LD_LOOP,
- "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
- "last %d\n",
- (int)GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()),
- (int)GetNumberOfWorkersInTeam(), (long long)*plower,
- (long long)*pupper, (long long)*pstride, (int)*plast);
- return DISPATCH_NOTFINISHED;
- }
-
- INLINE static void dispatch_fini() {
- // nothing
- }
-
- ////////////////////////////////////////////////////////////////////////////////
- // end of template class that encapsulates all the helper functions
- ////////////////////////////////////////////////////////////////////////////////
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP interface implementation (dyn loops)
-////////////////////////////////////////////////////////////////////////////////
-
-// init
-EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid,
- int32_t schedule, int32_t lb, int32_t ub,
- int32_t st, int32_t chunk) {
- PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
- omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
- loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
-}
-
-EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid,
- int32_t schedule, uint32_t lb, uint32_t ub,
- int32_t st, int32_t chunk) {
- PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
- omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
- loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
-}
-
-EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid,
- int32_t schedule, int64_t lb, int64_t ub,
- int64_t st, int64_t chunk) {
- PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
- omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
- loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
-}
-
-EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid,
- int32_t schedule, uint64_t lb, uint64_t ub,
- int64_t st, int64_t chunk) {
- PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
- omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
- loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
-}
-
-// next
-EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last,
- int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
- PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
- return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
- loc, tid, p_last, p_lb, p_ub, p_st);
-}
-
-EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid, int32_t *p_last,
- uint32_t *p_lb, uint32_t *p_ub,
- int32_t *p_st) {
- PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
- return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
- loc, tid, p_last, p_lb, p_ub, p_st);
-}
-
-EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last,
- int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
- PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
- return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
- loc, tid, p_last, p_lb, p_ub, p_st);
-}
-
-EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid, int32_t *p_last,
- uint64_t *p_lb, uint64_t *p_ub,
- int64_t *p_st) {
- PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
- return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
- loc, tid, p_last, p_lb, p_ub, p_st);
-}
-
-// fini
-EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) {
- PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
- omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
-}
-
-EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) {
- PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
- omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
-}
-
-EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t tid) {
- PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
- omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
-}
-
-EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) {
- PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
- omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
-}
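
A hedged sketch of the driver loop the compiler typically emits around these dispatch entry points for `#pragma omp for schedule(dynamic, C)`; `loc`, `tid`, `N`, `C` and `body` are placeholders, and the schedule constant name follows this file's internal enum (the value clang actually encodes may differ):

    int32_t last = 0, lb = 0, ub = N - 1, st = 1;
    __kmpc_dispatch_init_4(loc, tid, /*schedule=*/kmp_sched_dynamic, lb, ub, st, C);
    while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st)) {
      for (int32_t i = lb; i <= ub; ++i)  // run the chunk handed back
        body(i);
    }
    __kmpc_dispatch_fini_4(loc, tid);
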
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP interface implementation (static loops)
-////////////////////////////////////////////////////////////////////////////////
-
-EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- int32_t *plower, int32_t *pupper,
- int32_t *pstride, int32_t incr,
- int32_t chunk) {
- PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
- omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- __kmpc_is_spmd_exec_mode());
-}
-
-EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- uint32_t *plower, uint32_t *pupper,
- int32_t *pstride, int32_t incr,
- int32_t chunk) {
- PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
- omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- __kmpc_is_spmd_exec_mode());
-}
-
-EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- int64_t *plower, int64_t *pupper,
- int64_t *pstride, int64_t incr,
- int64_t chunk) {
- PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
- omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- __kmpc_is_spmd_exec_mode());
-}
-
-EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- uint64_t *plower, uint64_t *pupper,
- int64_t *pstride, int64_t incr,
- int64_t chunk) {
- PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
- omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- __kmpc_is_spmd_exec_mode());
-}
-
-EXTERN void __kmpc_distribute_static_init_4(kmp_Ident *loc, int32_t global_tid,
- int32_t schedtype,
- int32_t *plastiter, int32_t *plower,
- int32_t *pupper, int32_t *pstride,
- int32_t incr, int32_t chunk) {
- PRINT0(LD_IO, "call kmpc_distribute_static_init_4\n");
- omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- __kmpc_is_spmd_exec_mode());
-}
-
-EXTERN void __kmpc_distribute_static_init_4u(kmp_Ident *loc, int32_t global_tid,
- int32_t schedtype,
- int32_t *plastiter,
- uint32_t *plower, uint32_t *pupper,
- int32_t *pstride, int32_t incr,
- int32_t chunk) {
- PRINT0(LD_IO, "call kmpc_distribute_static_init_4u\n");
- omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- __kmpc_is_spmd_exec_mode());
-}
-
-EXTERN void __kmpc_distribute_static_init_8(kmp_Ident *loc, int32_t global_tid,
- int32_t schedtype,
- int32_t *plastiter, int64_t *plower,
- int64_t *pupper, int64_t *pstride,
- int64_t incr, int64_t chunk) {
- PRINT0(LD_IO, "call kmpc_distribute_static_init_8\n");
- omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- __kmpc_is_spmd_exec_mode());
-}
-
-EXTERN void __kmpc_distribute_static_init_8u(kmp_Ident *loc, int32_t global_tid,
- int32_t schedtype,
- int32_t *plastiter,
- uint64_t *plower, uint64_t *pupper,
- int64_t *pstride, int64_t incr,
- int64_t chunk) {
- PRINT0(LD_IO, "call kmpc_distribute_static_init_8u\n");
- omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- __kmpc_is_spmd_exec_mode());
-}
-
-EXTERN
-void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- int32_t *plower, int32_t *pupper,
- int32_t *pstride, int32_t incr,
- int32_t chunk) {
- PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
- omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/true);
-}
-
-EXTERN
-void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
- int32_t schedtype,
- int32_t *plastiter, uint32_t *plower,
- uint32_t *pupper, int32_t *pstride,
- int32_t incr, int32_t chunk) {
- PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
- omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/true);
-}
-
-EXTERN
-void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid,
- int32_t schedtype, int32_t *plastiter,
- int64_t *plower, int64_t *pupper,
- int64_t *pstride, int64_t incr,
- int64_t chunk) {
- PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
- omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/true);
-}
-
-EXTERN
-void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
- int32_t schedtype,
- int32_t *plastiter, uint64_t *plower,
- uint64_t *pupper, int64_t *pstride,
- int64_t incr, int64_t chunk) {
- PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
- omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/true);
-}
-
-EXTERN
-void __kmpc_for_static_init_4_simple_generic(kmp_Ident *loc, int32_t global_tid,
- int32_t schedtype,
- int32_t *plastiter,
- int32_t *plower, int32_t *pupper,
- int32_t *pstride, int32_t incr,
- int32_t chunk) {
- PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
- omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/false);
-}
-
-EXTERN
-void __kmpc_for_static_init_4u_simple_generic(
- kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
- uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
- int32_t chunk) {
- PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
- omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/false);
-}
-
-EXTERN
-void __kmpc_for_static_init_8_simple_generic(kmp_Ident *loc, int32_t global_tid,
- int32_t schedtype,
- int32_t *plastiter,
- int64_t *plower, int64_t *pupper,
- int64_t *pstride, int64_t incr,
- int64_t chunk) {
- PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
- omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/false);
-}
-
-EXTERN
-void __kmpc_for_static_init_8u_simple_generic(
- kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
- uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
- int64_t chunk) {
- PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
- omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
- global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
- /*IsSPMDExecutionMode=*/false);
-}
-
-EXTERN void __kmpc_distribute_static_fini(kmp_Ident *loc, int32_t global_tid) {
- PRINT0(LD_IO, "call kmpc_distribute_static_fini\n");
-}
-
-EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
- PRINT0(LD_IO, "call kmpc_for_static_fini\n");
-}
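
And the static counterpart: a sketch of the call pattern for `#pragma omp for schedule(static)` with no chunk clause. Again `loc`, `tid`, `N` and `body` are placeholders, and the schedule constant name follows this file's enum:

    int32_t last = 0, lb = 0, ub = N - 1, stride = 1;
    __kmpc_for_static_init_4(loc, tid, /*schedtype=*/kmp_sched_static_nochunk,
                             &last, &lb, &ub, &stride, /*incr=*/1, /*chunk=*/0);
    for (int32_t i = lb; i <= ub; ++i)    // each thread sees only its own bounds
      body(i);
    __kmpc_for_static_fini(loc, tid);
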
-
-#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
deleted file mode 100644
index aab16a31bb085..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
+++ /dev/null
@@ -1,65 +0,0 @@
-//===------------ omp_data.cu - OpenMP GPU objects --------------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the data objects used on the GPU device.
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "common/allocator.h"
-#include "common/omptarget.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// global device environment
-////////////////////////////////////////////////////////////////////////////////
-
-PLUGIN_ACCESSIBLE
-DeviceEnvironmentTy omptarget_device_environment;
-
-////////////////////////////////////////////////////////////////////////////////
-// global data holding OpenMP state information
-////////////////////////////////////////////////////////////////////////////////
-
-// OpenMP will try to call its ctor if we don't add the attribute explicitly
-[[clang::loader_uninitialized]] omptarget_nvptx_Queue<
- omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
- omptarget_nvptx_device_State[MAX_SM];
-
-omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager;
-uint32_t SHARED(usedMemIdx);
-uint32_t SHARED(usedSlotIdx);
-
-// SHARED doesn't work with arrays, so we add the attribute explicitly.
-[[clang::loader_uninitialized]] uint8_t
- parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
-#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
-uint16_t SHARED(threadLimit);
-uint16_t SHARED(threadsInTeam);
-uint16_t SHARED(nThreads);
-// Pointer to this team's OpenMP state object
-omptarget_nvptx_ThreadPrivateContext *
- SHARED(omptarget_nvptx_threadPrivateContext);
-
-////////////////////////////////////////////////////////////////////////////////
-// The team master sets the outlined parallel function in this variable to
-// communicate with the workers. Since it is in shared memory, there is one
-// copy of these variables for each kernel instance and team.
-////////////////////////////////////////////////////////////////////////////////
-omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn);
-
-////////////////////////////////////////////////////////////////////////////////
-// OpenMP kernel execution parameters
-////////////////////////////////////////////////////////////////////////////////
-int8_t SHARED(execution_param);
-
-////////////////////////////////////////////////////////////////////////////////
-// Scratchpad for teams reduction.
-////////////////////////////////////////////////////////////////////////////////
-void *SHARED(ReductionScratchpadPtr);
-
-#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
deleted file mode 100644
index 8862026e69efb..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ /dev/null
@@ -1,259 +0,0 @@
-//===--- omptarget.cu - OpenMP GPU initialization ---------------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the initialization code for the GPU
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "common/omptarget.h"
-#include "common/support.h"
-#include "target_impl.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// global data tables
-////////////////////////////////////////////////////////////////////////////////
-
-extern omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext,
- OMP_STATE_COUNT>
- omptarget_nvptx_device_State[MAX_SM];
-
-////////////////////////////////////////////////////////////////////////////////
-// init entry points
-////////////////////////////////////////////////////////////////////////////////
-
-static void __kmpc_generic_kernel_init() {
- PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
- OMPTARGET_NVPTX_VERSION);
-
- if (GetLaneId() == 0)
- parallelLevel[GetWarpId()] = 0;
-
- int threadIdInBlock = __kmpc_get_hardware_thread_id_in_block();
- if (threadIdInBlock != GetMasterThreadID())
- return;
-
- setExecutionParameters(OMP_TGT_EXEC_MODE_GENERIC, OMP_TGT_RUNTIME_INITIALIZED);
- ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(),
- "__kmpc_kernel_init() must be called by team master warp only!");
- PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n");
-
- // Get a state object from the queue.
- int slot = __kmpc_impl_smid() % MAX_SM;
- usedSlotIdx = slot;
- omptarget_nvptx_threadPrivateContext =
- omptarget_nvptx_device_State[slot].Dequeue();
-
- // init thread private
- int threadId = 0;
- omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId);
-
- // init team context
- omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
- currTeamDescr.InitTeamDescr();
- // This thread will start execution; it has to update its task ICV
- // to point to the level zero task ICV, which was initialized in
- // InitTeamDescr().
- omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
- threadId, currTeamDescr.LevelZeroTaskDescr());
-
- // set number of threads and thread limit in team to started value
- omptarget_nvptx_TaskDescr *currTaskDescr =
- omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
- nThreads = GetNumberOfWorkersInTeam();
- threadLimit = nThreads;
-
- __kmpc_data_sharing_init_stack();
- __kmpc_impl_target_init();
-}
-
-static void __kmpc_generic_kernel_deinit() {
- PRINT0(LD_IO, "call to __kmpc_kernel_deinit\n");
- // Enqueue omp state object for use by another team.
- int slot = usedSlotIdx;
- omptarget_nvptx_device_State[slot].Enqueue(
- omptarget_nvptx_threadPrivateContext);
- // Done with work. Kill the workers.
- omptarget_nvptx_workFn = 0;
-}
-
-static void __kmpc_spmd_kernel_init(bool RequiresFullRuntime) {
- PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n");
-
- setExecutionParameters(OMP_TGT_EXEC_MODE_SPMD,
- RequiresFullRuntime ? OMP_TGT_RUNTIME_INITIALIZED
- : OMP_TGT_RUNTIME_UNINITIALIZED);
- int threadId = __kmpc_get_hardware_thread_id_in_block();
- if (threadId == 0) {
- usedSlotIdx = __kmpc_impl_smid() % MAX_SM;
- }
-
- if (GetLaneId() == 0) {
- parallelLevel[GetWarpId()] =
- 1 + (__kmpc_get_hardware_num_threads_in_block() > 1
- ? OMP_ACTIVE_PARALLEL_LEVEL
- : 0);
- }
-
- __kmpc_data_sharing_init_stack();
- if (!RequiresFullRuntime)
- return;
-
- //
- // Team Context Initialization.
- //
- // In SPMD mode there is no master thread, so use any CUDA thread for team
- // context initialization.
- if (threadId == 0) {
- // Get a state object from the queue.
- omptarget_nvptx_threadPrivateContext =
- omptarget_nvptx_device_State[usedSlotIdx].Dequeue();
-
- omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
- omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
- // init team context
- currTeamDescr.InitTeamDescr();
- }
- __kmpc_impl_syncthreads();
-
- omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
- omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
-
- //
- // Initialize task descr for each thread.
- //
- omptarget_nvptx_TaskDescr *newTaskDescr =
- omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
- ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
- newTaskDescr->InitLevelOneTaskDescr(currTeamDescr.LevelZeroTaskDescr());
- // install new top descriptor
- omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
- newTaskDescr);
-
- // init thread private from init value
- int ThreadLimit = GetNumberOfProcsInTeam(/* IsSPMD */ true);
- PRINT(LD_PAR,
- "thread will execute parallel region with id %d in a team of "
- "%d threads\n",
- (int)newTaskDescr->ThreadId(), (int)ThreadLimit);
-}
-
-static void __kmpc_spmd_kernel_deinit(bool RequiresFullRuntime) {
- // We're not going to pop the task descr stack of each thread since
- // there are no more parallel regions in SPMD mode.
- if (!RequiresFullRuntime)
- return;
-
- __kmpc_impl_syncthreads();
- int threadId = __kmpc_get_hardware_thread_id_in_block();
- if (threadId == 0) {
- // Enqueue omp state object for use by another team.
- int slot = usedSlotIdx;
- omptarget_nvptx_device_State[slot].Enqueue(
- omptarget_nvptx_threadPrivateContext);
- }
-}
-
-// Return true if the current target region is executed in SPMD mode.
-// NOTE: This function has to return 1 for SPMD mode, and 0 for generic mode.
-// That's because `__kmpc_parallel_51` checks whether it is already in a
-// parallel region by comparing the parallel level with the return value of
-// this function.
-EXTERN int8_t __kmpc_is_spmd_exec_mode() {
- return (execution_param & OMP_TGT_EXEC_MODE_SPMD) == OMP_TGT_EXEC_MODE_SPMD;
-}
-
-EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid) {
- return !__kmpc_is_spmd_exec_mode() && __kmpc_is_generic_main_thread_id(Tid);
-}
-
-NOINLINE EXTERN int8_t __kmpc_is_generic_main_thread_id(kmp_int32 Tid) {
- return GetMasterThreadID() == Tid;
-}
-
-EXTERN bool __kmpc_kernel_parallel(void**WorkFn);
-
-static void __kmpc_target_region_state_machine(ident_t *Ident) {
-
- int TId = __kmpc_get_hardware_thread_id_in_block();
- do {
- void* WorkFn = 0;
-
- // Wait for the signal that we have a new work function.
- __kmpc_barrier_simple_spmd(Ident, TId);
-
-
- // Retrieve the work function from the runtime.
- bool IsActive = __kmpc_kernel_parallel(&WorkFn);
-
- // If there is nothing more to do, break out of the state machine by
- // returning to the caller.
- if (!WorkFn)
- return;
-
- if (IsActive) {
- ((void(*)(uint32_t,uint32_t))WorkFn)(0, TId);
- __kmpc_kernel_end_parallel();
- }
-
- __kmpc_barrier_simple_spmd(Ident, TId);
-
- } while (true);
-}
-
-EXTERN
-int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode,
- bool UseGenericStateMachine,
- bool RequiresFullRuntime) {
- const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
- int TId = __kmpc_get_hardware_thread_id_in_block();
- if (IsSPMD)
- __kmpc_spmd_kernel_init(RequiresFullRuntime);
- else
- __kmpc_generic_kernel_init();
-
- if (IsSPMD) {
- __kmpc_barrier_simple_spmd(Ident, TId);
- return -1;
- }
-
- if (TId == GetMasterThreadID())
- return -1;
-
- // Enter the generic state machine if enabled and if this thread can possibly
- // be an active worker thread.
- //
- // The latter check is important for NVIDIA Pascal (but not Volta) and AMD
- // GPU. In those cases, a single thread can apparently satisfy a barrier on
- // behalf of all threads in the same warp. Thus, it would not be safe for
- // other threads in the main thread's warp to reach the first
- // __kmpc_barrier_simple_spmd call in __kmpc_target_region_state_machine
- // before the main thread reaches its corresponding
- // __kmpc_barrier_simple_spmd call: that would permit all active worker
- // threads to proceed before the main thread has actually set
- // omptarget_nvptx_workFn, and then they would immediately quit without
- // doing any work. GetNumberOfWorkersInTeam() does not include any of the
- // main thread's warp, so none of its threads can ever be active worker
- // threads.
- if (UseGenericStateMachine && TId < GetNumberOfWorkersInTeam())
- __kmpc_target_region_state_machine(Ident);
-
- return TId;
-}
-
-EXTERN
-void __kmpc_target_deinit(ident_t *Ident, int8_t Mode,
- bool RequiresFullRuntime) {
- const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
- if (IsSPMD)
- __kmpc_spmd_kernel_deinit(RequiresFullRuntime);
- else
- __kmpc_generic_kernel_deinit();
-}
-
-#pragma omp end declare target
diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
deleted file mode 100644
index 4b27529473dcc..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
+++ /dev/null
@@ -1,341 +0,0 @@
-//===---- parallel.cu - GPU OpenMP parallel implementation ------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Parallel implementation in the GPU. Here is the pattern:
-//
-// while (not finished) {
-//
-// if (master) {
-// sequential code, decide which par loop to do, or if finished
-// __kmpc_kernel_prepare_parallel() // exec by master only
-// }
-// syncthreads // A
-// __kmpc_kernel_parallel() // exec by all
-// if (this thread is included in the parallel) {
-// switch () for all parallel loops
-// __kmpc_kernel_end_parallel() // exec only by threads in parallel
-// }
-//
-//
-// The reason we don't execute end_parallel for the threads not included
-// in the parallel loop is that, for each barrier in the parallel
-// region, these non-included threads will cycle through the
-// syncthreads A. Thus they must preserve their current threadId, which
-// is larger than the number of threads in the team.
-//
-// To make a long story short...
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "common/omptarget.h"
-#include "target_impl.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// support for parallel that goes parallel (1 static level only)
-////////////////////////////////////////////////////////////////////////////////
-
-INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
- uint16_t NThreadsICV,
- uint16_t ThreadLimit) {
- uint16_t ThreadsRequested = NThreadsICV;
- if (NumThreadsClause != 0) {
- ThreadsRequested = NumThreadsClause;
- }
-
- uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam();
- if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) {
- ThreadsAvailable = ThreadLimit;
- }
-
- uint16_t NumThreads = ThreadsAvailable;
- if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) {
- NumThreads = ThreadsRequested;
- }
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- // On Volta and newer architectures we require that all lanes in
- // a warp participate in the parallel region. Round down to a
- // multiple of WARPSIZE since it is legal to do so in OpenMP.
- if (NumThreads < WARPSIZE) {
- NumThreads = 1;
- } else {
- NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1));
- }
-#endif
-
- return NumThreads;
-}
-
-// This routine is always called by the team master.
-EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
- kmp_int32 NumThreadsClause) {
- PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
-
- omptarget_nvptx_workFn = WorkFn;
-
- // This routine is only called by the team master. The team master is
- // the first thread of the last warp. It always has the logical thread
- // id of 0 (since it is a shadow for the first worker thread).
- const int threadId = 0;
- omptarget_nvptx_TaskDescr *currTaskDescr =
- omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
- ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
- ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
- "cannot be called in a parallel region.");
- if (currTaskDescr->InParallelRegion()) {
- PRINT0(LD_PAR, "already in parallel: go seq\n");
- return;
- }
-
- uint16_t NumThreads =
- determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit);
-
- if (NumThreadsClause != 0) {
- // Reset request to avoid propagating to successive #parallel
- NumThreadsClause = 0;
- }
-
- ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
- (int)NumThreads);
- ASSERT0(LT_FUSSY,
- __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
- "only team master can create parallel");
-
- // Set number of threads on work descriptor.
- omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
- workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr);
- threadsInTeam = NumThreads;
-}
-
-// All workers call this function. Deactivate those not needed.
-// Fn - the outlined work function to execute.
-// returns True if this thread is active, else False.
-//
-// Only the worker threads call this routine.
-EXTERN bool __kmpc_kernel_parallel(void **WorkFn) {
- PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");
-
- // Work function and arguments for L1 parallel region.
- *WorkFn = omptarget_nvptx_workFn;
-
- // If this is the termination signal from the master, quit early.
- if (!*WorkFn) {
- PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n");
- return false;
- }
-
- // Only the worker threads call this routine and the master warp
- // never arrives here. Therefore, use the nvptx thread id.
- int threadId = __kmpc_get_hardware_thread_id_in_block();
- omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
- // Set to true for workers participating in the parallel region.
- bool isActive = false;
- // Initialize state for active threads.
- if (threadId < threadsInTeam) {
- // init work descriptor from workDescr
- omptarget_nvptx_TaskDescr *newTaskDescr =
- omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
- ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
- newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr());
- // install new top descriptor
- omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
- newTaskDescr);
- // init private from int value
- PRINT(LD_PAR,
- "thread will execute parallel region with id %d in a team of "
- "%d threads\n",
- (int)newTaskDescr->ThreadId(), (int)nThreads);
-
- isActive = true;
- }
-
- return isActive;
-}
-
-EXTERN void __kmpc_kernel_end_parallel() {
- // pop stack
- PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
- ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
-
- // Only the worker threads call this routine and the master warp
- // never arrives here. Therefore, use the nvptx thread id.
- int threadId = __kmpc_get_hardware_thread_id_in_block();
- omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
- omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
- threadId, currTaskDescr->GetPrevTaskDescr());
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// support for parallel that goes sequential
-////////////////////////////////////////////////////////////////////////////////
-
-static void serializedParallel(kmp_Ident *loc, uint32_t global_tid) {
- PRINT0(LD_IO, "call to serializedParallel\n");
-
- IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());
-
- if (isRuntimeUninitialized()) {
- ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
- "Expected SPMD mode with uninitialized runtime.");
- return;
- }
-
- // assume this is only called for nested parallel
- int threadId = GetLogicalThreadIdInBlock();
-
- // unlike actual parallel, threads in the same team do not share
- // the workTaskDescr in this case and num threads is fixed to 1
-
- // get current task
- omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
- currTaskDescr->SaveLoopData();
-
- // allocate new task descriptor and copy value from current one, set prev to
- // it
- omptarget_nvptx_TaskDescr *newTaskDescr =
- (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr),
- "new seq parallel task");
- newTaskDescr->CopyParent(currTaskDescr);
-
- // tweak values for serialized parallel case:
- // - each thread becomes ID 0 in its serialized parallel, and
- // - there is only one thread per team
- newTaskDescr->ThreadId() = 0;
-
- // set new task descriptor as top
- omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
- newTaskDescr);
-}
-
-static void endSerializedParallel(kmp_Ident *loc,
- uint32_t global_tid) {
- PRINT0(LD_IO, "call to endSerializedParallel\n");
-
- DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());
-
- if (isRuntimeUninitialized()) {
- ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
- "Expected SPMD mode with uninitialized runtime.");
- return;
- }
-
- // pop stack
- int threadId = GetLogicalThreadIdInBlock();
- omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
- // set new top
- omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
- threadId, currTaskDescr->GetPrevTaskDescr());
- // free
- SafeFree(currTaskDescr, "new seq parallel task");
- currTaskDescr = getMyTopTaskDescriptor(threadId);
- currTaskDescr->RestoreLoopData();
-}
-
-NOINLINE EXTERN uint8_t __kmpc_parallel_level() {
- return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
-}
-
-// This kmpc call returns the thread id across all teams. It's value is
-// cached by the compiler and used when calling the runtime. On nvptx
-// it's cheap to recalculate this value so we never use the result
-// of this call.
-EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
- return GetOmpThreadId();
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// push params
-////////////////////////////////////////////////////////////////////////////////
-
-// Do nothing. The host guarantees we started the requested number of
-// teams and we only need inspection of gridDim.
-
-EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid,
- int32_t num_teams, int32_t thread_limit) {
- PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams);
- ASSERT0(LT_FUSSY, 0, "should never have anything with new teams on device");
-}
-
-EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid, int proc_bind) {
- PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// parallel interface
-////////////////////////////////////////////////////////////////////////////////
-
-NOINLINE EXTERN void __kmpc_parallel_51(kmp_Ident *ident, kmp_int32 global_tid,
- kmp_int32 if_expr,
- kmp_int32 num_threads, int proc_bind,
- void *fn, void *wrapper_fn, void **args,
- size_t nargs) {
- // Handle the serialized case first, same for SPMD/non-SPMD except that in
- // SPMD mode we already incremented the parallel level counter, account for
- // that.
- bool InParallelRegion =
- (__kmpc_parallel_level() > __kmpc_is_spmd_exec_mode());
- if (!if_expr || InParallelRegion) {
- serializedParallel(ident, global_tid);
- __kmp_invoke_microtask(global_tid, 0, fn, args, nargs);
- endSerializedParallel(ident, global_tid);
- return;
- }
-
- if (__kmpc_is_spmd_exec_mode()) {
- __kmp_invoke_microtask(global_tid, 0, fn, args, nargs);
- return;
- }
-
- __kmpc_kernel_prepare_parallel((void *)wrapper_fn, num_threads);
-
- if (nargs) {
- void **GlobalArgs;
- __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
- // TODO: faster memcpy?
-#pragma unroll
- for (int I = 0; I < nargs; I++)
- GlobalArgs[I] = args[I];
- }
-
- // TODO: what if that's a parallel region with a single thread? this is
- // considered not active in the existing implementation.
- bool IsActiveParallelRegion = threadsInTeam != 1;
- int NumWarps =
- threadsInTeam / WARPSIZE + ((threadsInTeam % WARPSIZE) ? 1 : 0);
- // Increment parallel level for non-SPMD warps.
- for (int I = 0; I < NumWarps; ++I)
- parallelLevel[I] +=
- (1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
-
- // Master signals work to activate workers.
- __kmpc_barrier_simple_spmd(ident, 0);
-
- // OpenMP [2.5, Parallel Construct, p.49]
- // There is an implied barrier at the end of a parallel region. After the
- // end of a parallel region, only the master thread of the team resumes
- // execution of the enclosing task region.
- //
- // The master waits at this barrier until all workers are done.
- __kmpc_barrier_simple_spmd(ident, 0);
-
- // Decrement parallel level for non-SPMD warps.
- for (int I = 0; I < NumWarps; ++I)
- parallelLevel[I] -=
- (1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
- // TODO: Is synchronization needed since out of parallel execution?
-
- if (nargs)
- __kmpc_end_sharing_variables();
-
- // TODO: proc_bind is a noop?
- // if (proc_bind != proc_bind_default)
- // __kmpc_push_proc_bind(ident, global_tid, proc_bind);
-}
-
-#pragma omp end declare target
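
The piece of the removed parallel.cu most worth remembering is the thread-count selection: the num_threads clause (or the nthreads ICV) is clamped against the team's worker count and thread_limit, then rounded down to a warp multiple on Volta-class hardware. A host-side C++ recreation of that logic is sketched below; it assumes the Volta rounding always applies, and WorkersInTeam stands in for GetNumberOfWorkersInTeam():

  #include <cassert>
  #include <cstdint>

  static constexpr uint16_t WARPSIZE = 32;

  // Pure recreation of the removed determineNumberOfThreads().
  uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
                                    uint16_t NThreadsICV,
                                    uint16_t ThreadLimit,
                                    uint16_t WorkersInTeam) {
    uint16_t Requested = NumThreadsClause ? NumThreadsClause : NThreadsICV;

    uint16_t Available = WorkersInTeam;
    if (ThreadLimit && ThreadLimit < Available)
      Available = ThreadLimit;

    uint16_t NumThreads = Available;
    if (Requested && Requested < NumThreads)
      NumThreads = Requested;

    // Volta and newer: all lanes of a warp participate, so round down to a
    // warp multiple (legal in OpenMP); fewer than a warp degrades to 1.
    if (NumThreads < WARPSIZE)
      NumThreads = 1;
    else
      NumThreads &= ~(WARPSIZE - 1);
    return NumThreads;
  }

  int main() {
    assert(determineNumberOfThreads(0, 0, 0, 96) == 96);
    assert(determineNumberOfThreads(40, 0, 0, 96) == 32); // rounded down
    assert(determineNumberOfThreads(20, 0, 0, 96) == 1);  // below a warp
    return 0;
  }
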
diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
deleted file mode 100644
index e975c8d140186..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ /dev/null
@@ -1,309 +0,0 @@
-//===---- reduction.cu - GPU OpenMP reduction implementation ----- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of reduction with KMPC interface.
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "common/omptarget.h"
-#include "target/shuffle.h"
-#include "target_impl.h"
-
-EXTERN
-void __kmpc_nvptx_end_reduce(int32_t global_tid) {}
-
-EXTERN
-void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}
-
-INLINE static void gpu_regular_warp_reduce(void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct) {
- for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
- shflFct(reduce_data, /*LaneId - not used= */ 0,
- /*Offset = */ mask, /*AlgoVersion=*/0);
- }
-}
-
-INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct,
- uint32_t size, uint32_t tid) {
- uint32_t curr_size;
- uint32_t mask;
- curr_size = size;
- mask = curr_size / 2;
- while (mask > 0) {
- shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
- curr_size = (curr_size + 1) / 2;
- mask = curr_size / 2;
- }
-}
-
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
-INLINE static uint32_t
-gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
- uint32_t size, remote_id, physical_lane_id;
- physical_lane_id = __kmpc_get_hardware_thread_id_in_block() % WARPSIZE;
- __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
- __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
- uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
- __kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt();
- do {
- Liveness = __kmpc_impl_activemask();
- remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
- size = __kmpc_impl_popc(Liveness);
- logical_lane_id /= 2;
- shflFct(reduce_data, /*LaneId =*/logical_lane_id,
- /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
- } while (logical_lane_id % 2 == 0 && size > 1);
- return (logical_lane_id == 0);
-}
-#endif
-
-INLINE
-static int32_t nvptx_parallel_reduce_nowait(
- int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
- kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
- bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
- uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
- uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
- if (NumThreads == 1)
- return 1;
- /*
- * This reduce function handles reduction within a team. It handles
- * parallel regions in both L1 and L2 parallelism levels. It also
- * supports Generic, SPMD, and NoOMP modes.
- *
- * 1. Reduce within a warp.
- * 2. Warp master copies value to warp 0 via shared memory.
- * 3. Warp 0 reduces to a single value.
- * 4. The reduced value is available in the thread that returns 1.
- */
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
- uint32_t WarpId = BlockThreadId / WARPSIZE;
-
- // Volta execution model:
- // For the Generic execution mode a parallel region either has 1 thread and
- // beyond that, always a multiple of 32. For the SPMD execution mode we may
- // have any number of threads.
- if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
- gpu_regular_warp_reduce(reduce_data, shflFct);
- else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
- gpu_irregular_warp_reduce(
- reduce_data, shflFct,
- /*LaneCount=*/NumThreads % WARPSIZE,
- /*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE);
-
- // When we have more than [warpsize] number of threads
- // a block reduction is performed here.
- //
- // Only L1 parallel region can enter this if condition.
- if (NumThreads > WARPSIZE) {
- // Gather all the reduced values from each warp
- // to the first warp.
- cpyFct(reduce_data, WarpsNeeded);
-
- if (WarpId == 0)
- gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
- BlockThreadId);
- }
- return BlockThreadId == 0;
-#else
- __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
- if (Liveness == __kmpc_impl_all_lanes) // Full warp
- gpu_regular_warp_reduce(reduce_data, shflFct);
- else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
- gpu_irregular_warp_reduce(
- reduce_data, shflFct,
- /*LaneCount=*/__kmpc_impl_popc(Liveness),
- /*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE);
- else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
- // parallel region may enter here; return
- // early.
- return gpu_irregular_simd_reduce(reduce_data, shflFct);
-
- // When we have more than [warpsize] number of threads
- // a block reduction is performed here.
- //
- // Only L1 parallel region can enter this if condition.
- if (NumThreads > WARPSIZE) {
- uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
- // Gather all the reduced values from each warp
- // to the first warp.
- cpyFct(reduce_data, WarpsNeeded);
-
- uint32_t WarpId = BlockThreadId / WARPSIZE;
- if (WarpId == 0)
- gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
- BlockThreadId);
-
- return BlockThreadId == 0;
- } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) {
- return BlockThreadId == 0;
- }
-
- // Get the OMP thread Id. This is different from BlockThreadId in the case of
- // an L2 parallel region.
- return global_tid == 0;
-#endif // __CUDA_ARCH__ >= 700
-}
-
-EXTERN
-int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
- kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
- void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
- kmp_InterWarpCopyFctPtr cpyFct) {
- return nvptx_parallel_reduce_nowait(
- global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
- __kmpc_is_spmd_exec_mode(), isRuntimeUninitialized());
-}
-
-INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
- return !__kmpc_is_spmd_exec_mode() || IsTeamMaster(ThreadId);
-}
-
-INLINE static uint32_t roundToWarpsize(uint32_t s) {
- if (s < WARPSIZE)
- return 1;
- return (s & ~(unsigned)(WARPSIZE - 1));
-}
-
-INLINE static uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
-
-static volatile uint32_t IterCnt = 0;
-static volatile uint32_t Cnt = 0;
-EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
- kmp_Ident *loc, int32_t global_tid, void *global_buffer,
- int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
- kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
- kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
- kmp_ListGlobalFctPtr glredFct) {
-
- // Terminate all threads in non-SPMD mode except for the master thread.
- if (!__kmpc_is_spmd_exec_mode() &&
- !__kmpc_is_generic_main_thread(__kmpc_get_hardware_thread_id_in_block()))
- return 0;
-
- uint32_t ThreadId = GetLogicalThreadIdInBlock();
-
- // In non-generic mode all workers participate in the teams reduction.
- // In generic mode only the team master participates in the teams
- // reduction because the workers are waiting for parallel work.
- uint32_t NumThreads =
- __kmpc_is_spmd_exec_mode() ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
- : /*Master thread only*/ 1;
- uint32_t TeamId = GetBlockIdInKernel();
- uint32_t NumTeams = __kmpc_get_hardware_num_blocks();
- static unsigned SHARED(Bound);
- static unsigned SHARED(ChunkTeamCount);
-
- // Block progress for teams greater than the current upper
- // limit. We always only allow a number of teams less or equal
- // to the number of slots in the buffer.
- bool IsMaster = isMaster(loc, ThreadId);
- while (IsMaster) {
- // Atomic read
- Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u);
- if (TeamId < Bound + num_of_records)
- break;
- }
-
- if (IsMaster) {
- int ModBockId = TeamId % num_of_records;
- if (TeamId < num_of_records)
- lgcpyFct(global_buffer, ModBockId, reduce_data);
- else
- lgredFct(global_buffer, ModBockId, reduce_data);
- __kmpc_impl_threadfence_system();
-
- // Increment team counter.
- // This counter is incremented by all teams in the current
- // BUFFER_SIZE chunk.
- ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u);
- }
- // Synchronize
- if (__kmpc_is_spmd_exec_mode())
- __kmpc_barrier(loc, global_tid);
-
- // reduce_data is global or shared so before being reduced within the
- // warp we need to bring it in local memory:
- // local_reduce_data = reduce_data[i]
- //
- // Example for 3 reduction variables a, b, c (of potentially different
- // types):
- //
- // buffer layout (struct of arrays):
- // a, a, ..., a, b, b, ... b, c, c, ... c
- // |__________|
- // num_of_records
- //
- // local_data_reduce layout (struct):
- // a, b, c
- //
- // Each thread will have a local struct containing the values to be
- // reduced:
- // 1. do reduction within each warp.
- // 2. do reduction across warps.
- // 3. write the final result to the main reduction variable
- // by returning 1 in the thread holding the reduction result.
-
- // Check if this is the very last team.
- unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
- if (ChunkTeamCount == NumTeams - Bound - 1) {
- //
- // Last team processing.
- //
- if (ThreadId >= NumRecs)
- return 0;
- NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
- if (ThreadId >= NumThreads)
- return 0;
-
- // Load from buffer and reduce.
- glcpyFct(global_buffer, ThreadId, reduce_data);
- for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
- glredFct(global_buffer, i, reduce_data);
-
- // Reduce across warps to the warp master.
- if (NumThreads > 1) {
- gpu_regular_warp_reduce(reduce_data, shflFct);
-
- // When we have more than [warpsize] number of threads
- // a block reduction is performed here.
- uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
- if (ActiveThreads > WARPSIZE) {
- uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
- // Gather all the reduced values from each warp
- // to the first warp.
- cpyFct(reduce_data, WarpsNeeded);
-
- uint32_t WarpId = ThreadId / WARPSIZE;
- if (WarpId == 0)
- gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
- ThreadId);
- }
- }
-
- if (IsMaster) {
- Cnt = 0;
- IterCnt = 0;
- return 1;
- }
- return 0;
- }
- if (IsMaster && ChunkTeamCount == num_of_records - 1) {
- // Allow SIZE number of teams to proceed writing their
- // intermediate results to the global buffer.
- __kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records));
- }
-
- return 0;
-}
-
-#pragma omp end declare target
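
At its core the removed parallel reduction is a shuffle-based halving tree inside each warp, followed by one cross-warp pass executed by warp 0. A minimal host-side C++ model of the halving schedule in gpu_regular_warp_reduce is sketched below; the device shuffle is emulated with a per-step snapshot of lane values, so this shows the data movement only, not the real intrinsic:

  #include <cassert>
  #include <cstdint>
  #include <vector>

  static constexpr uint32_t WARPSIZE = 32;

  // Each step, lane i adds the value lane i + Mask held before the step,
  // with Mask halving from WARPSIZE/2 down to 1; lane 0 ends up with the sum.
  void regularWarpReduce(std::vector<int> &Lanes) {
    for (uint32_t Mask = WARPSIZE / 2; Mask > 0; Mask /= 2) {
      std::vector<int> Snapshot = Lanes; // shuffles read pre-step values
      for (uint32_t Lane = 0; Lane + Mask < Lanes.size(); ++Lane)
        Lanes[Lane] += Snapshot[Lane + Mask];
    }
  }

  int main() {
    std::vector<int> Lanes(WARPSIZE, 1);
    regularWarpReduce(Lanes);
    assert(Lanes[0] == static_cast<int>(WARPSIZE)); // warp-wide sum in lane 0
    return 0;
  }
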
diff --git a/openmp/libomptarget/deviceRTLs/common/src/shuffle.cpp b/openmp/libomptarget/deviceRTLs/common/src/shuffle.cpp
deleted file mode 100644
index 9cb49c77182c7..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/src/shuffle.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-//===--- shuffle.cpp - Implementation of the external shuffle idiom API -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include "target/shuffle.h"
-
-#pragma omp declare target
-
-static constexpr uint64_t AllLanes = -1;
-
-int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
- return __kmpc_impl_shfl_down_sync(AllLanes, val, delta, size);
-}
-
-int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
- uint32_t lo, hi;
- __kmpc_impl_unpack(val, lo, hi);
- hi = __kmpc_impl_shfl_down_sync(AllLanes, hi, delta, size);
- lo = __kmpc_impl_shfl_down_sync(AllLanes, lo, delta, size);
- return __kmpc_impl_pack(lo, hi);
-}
-
-#pragma omp end declare target
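
__kmpc_shuffle_int64 above moves a 64-bit value by shuffling its two 32-bit halves independently; the host-checkable part is the split-and-reassemble step. A sketch under the assumption that pack/unpack are plain word splitting (the lane exchange itself is device-only and omitted here):

  #include <cassert>
  #include <cstdint>

  // Mirrors the removed __kmpc_impl_unpack/__kmpc_impl_pack pair.
  static void unpack(uint64_t Val, uint32_t &Lo, uint32_t &Hi) {
    Lo = static_cast<uint32_t>(Val);
    Hi = static_cast<uint32_t>(Val >> 32);
  }

  static uint64_t pack(uint32_t Lo, uint32_t Hi) {
    return (static_cast<uint64_t>(Hi) << 32) | Lo;
  }

  int main() {
    uint32_t Lo, Hi;
    unpack(0x1122334455667788ULL, Lo, Hi);
    // ... each 32-bit word would be shuffled down the warp here ...
    assert(pack(Lo, Hi) == 0x1122334455667788ULL);
    return 0;
  }
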
diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu
deleted file mode 100644
index e6241a424a911..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/src/support.cu
+++ /dev/null
@@ -1,240 +0,0 @@
-//===--------- support.cu - GPU OpenMP support functions --------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Wrapper implementation to some functions natively supported by the GPU.
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "common/debug.h"
-#include "common/omptarget.h"
-#include "common/support.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// Execution Parameters
-////////////////////////////////////////////////////////////////////////////////
-
-void setExecutionParameters(OMPTgtExecModeFlags EMode,
- OMPTgtRuntimeModeFlags RMode) {
- execution_param = EMode;
- execution_param |= RMode;
-}
-
-bool isGenericMode() { return execution_param & OMP_TGT_EXEC_MODE_GENERIC; }
-
-bool isRuntimeUninitialized() { return !isRuntimeInitialized(); }
-
-bool isRuntimeInitialized() {
- return execution_param & OMP_TGT_RUNTIME_INITIALIZED;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// support: get info from machine
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// Calls to the Generic Scheme Implementation Layer (assuming 1D layout)
-//
-////////////////////////////////////////////////////////////////////////////////
-
-// The master thread id is the first thread (lane) of the last warp.
-// Thread id is 0 indexed.
-// E.g: If NumThreads is 33, master id is 32.
-// If NumThreads is 64, master id is 32.
-// If NumThreads is 97, master id is 96.
-// If NumThreads is 1024, master id is 992.
-//
-// Called in Generic Execution Mode only.
-int GetMasterThreadID() {
- return (__kmpc_get_hardware_num_threads_in_block() - 1) & ~(WARPSIZE - 1);
-}
-
-// The last warp is reserved for the master; other warps are workers.
-// Called in Generic Execution Mode only.
-int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }
-
-////////////////////////////////////////////////////////////////////////////////
-// get thread id in team
-
-// This function may be called in a parallel region by the workers
-// or a serial region by the master. If the master (whose CUDA thread
-// id is GetMasterThreadID()) calls this routine, we return 0 because
-// it is a shadow for the first worker.
-int GetLogicalThreadIdInBlock() {
- // Implemented using control flow (predication) instead of with a modulo
- // operation.
- int tid = __kmpc_get_hardware_thread_id_in_block();
- if (__kmpc_is_generic_main_thread(tid))
- return 0;
- else
- return tid;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// OpenMP Thread Support Layer
-//
-////////////////////////////////////////////////////////////////////////////////
-
-int GetOmpThreadId() {
- int tid = __kmpc_get_hardware_thread_id_in_block();
- if (__kmpc_is_generic_main_thread(tid))
- return 0;
- // omp_thread_num
- int rc;
- if (__kmpc_parallel_level() > 1) {
- rc = 0;
- } else if (__kmpc_is_spmd_exec_mode()) {
- rc = tid;
- } else {
- omptarget_nvptx_TaskDescr *currTaskDescr =
- omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
- ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
- rc = currTaskDescr->ThreadId();
- }
- return rc;
-}
-
-int GetNumberOfOmpThreads(bool isSPMDExecutionMode) {
- // omp_num_threads
- int rc;
- int Level = parallelLevel[GetWarpId()];
- if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) {
- rc = 1;
- } else if (isSPMDExecutionMode) {
- rc = __kmpc_get_hardware_num_threads_in_block();
- } else {
- rc = threadsInTeam;
- }
-
- return rc;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Team id linked to OpenMP
-
-int GetOmpTeamId() {
- // omp_team_num
- return GetBlockIdInKernel(); // assume 1 block per team
-}
-
-int GetNumberOfOmpTeams() {
- // omp_num_teams
- return __kmpc_get_hardware_num_blocks(); // assume 1 block per team
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Masters
-
-int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
-
-////////////////////////////////////////////////////////////////////////////////
-// Parallel level
-
-void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
- __kmpc_impl_syncwarp(Mask);
- __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
- unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
- if (Rank == 0) {
- parallelLevel[GetWarpId()] +=
- (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
- __kmpc_impl_threadfence();
- }
- __kmpc_impl_syncwarp(Mask);
-}
-
-void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
- __kmpc_impl_syncwarp(Mask);
- __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
- unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
- if (Rank == 0) {
- parallelLevel[GetWarpId()] -=
- (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
- __kmpc_impl_threadfence();
- }
- __kmpc_impl_syncwarp(Mask);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// get OpenMP number of procs
-
-// Get the number of processors in the device.
-int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) {
- if (!isSPMDExecutionMode)
- return GetNumberOfWorkersInTeam();
- return __kmpc_get_hardware_num_threads_in_block();
-}
-
-int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) {
- return GetNumberOfProcsInDevice(isSPMDExecutionMode);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Memory
-////////////////////////////////////////////////////////////////////////////////
-
-unsigned long PadBytes(unsigned long size,
- unsigned long alignment) // must be a power of 2
-{
- // compute the necessary padding to satisfy alignment constraint
- ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0,
- "alignment %lu is not a power of 2\n", alignment);
- return (~(unsigned long)size + 1) & (alignment - 1);
-}
-
-void *SafeMalloc(size_t size, const char *msg) // check if success
-{
- void *ptr = __kmpc_impl_malloc(size);
- PRINT(LD_MEM, "malloc data of size %llu for %s: 0x%llx\n",
- (unsigned long long)size, msg, (unsigned long long)ptr);
- return ptr;
-}
-
-void *SafeFree(void *ptr, const char *msg) {
- PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", (unsigned long long)ptr, msg);
- __kmpc_impl_free(ptr);
- return NULL;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Teams Reduction Scratchpad Helpers
-////////////////////////////////////////////////////////////////////////////////
-
-unsigned int *GetTeamsReductionTimestamp() {
- return static_cast<unsigned int *>(ReductionScratchpadPtr);
-}
-
-char *GetTeamsReductionScratchpad() {
- return static_cast<char *>(ReductionScratchpadPtr) + 256;
-}
-
-// Invoke an outlined parallel function unwrapping arguments (up
-// to 32).
-void __kmp_invoke_microtask(kmp_int32 global_tid, kmp_int32 bound_tid, void *fn,
- void **args, size_t nargs) {
- switch (nargs) {
-#include "common/generated_microtask_cases.gen"
- default:
- printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
- __builtin_trap();
- }
-}
-
-namespace _OMP {
-/// Helper to keep code alive without introducing a performance penalty.
-__attribute__((used, retain, weak, optnone, cold)) void keepAlive() {
- __kmpc_get_hardware_thread_id_in_block();
- __kmpc_get_hardware_num_threads_in_block();
- __kmpc_get_warp_size();
- __kmpc_barrier_simple_spmd(nullptr, 0);
- __kmpc_barrier_simple_generic(nullptr, 0);
-}
-} // namespace _OMP
-
-#pragma omp end declare target
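
The GetMasterThreadID() arithmetic removed above picks the first lane of the last warp; the examples in its comment (33 -> 32, 64 -> 32, 97 -> 96, 1024 -> 992) can be confirmed with the same expression on the host:

  #include <cassert>

  static constexpr int WARPSIZE = 32;

  // Block size minus one, rounded down to a warp boundary.
  int getMasterThreadId(int ThreadsInBlock) {
    return (ThreadsInBlock - 1) & ~(WARPSIZE - 1);
  }

  int main() {
    assert(getMasterThreadId(33) == 32);
    assert(getMasterThreadId(64) == 32);
    assert(getMasterThreadId(97) == 96);
    assert(getMasterThreadId(1024) == 992);
    return 0;
  }
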
diff --git a/openmp/libomptarget/deviceRTLs/common/src/sync.cu b/openmp/libomptarget/deviceRTLs/common/src/sync.cu
deleted file mode 100644
index 823c9fc1ef40b..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/src/sync.cu
+++ /dev/null
@@ -1,143 +0,0 @@
-//===------------ sync.cu - GPU OpenMP synchronizations ---------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Include all synchronization.
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "common/omptarget.h"
-#include "target_impl.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP Ordered calls
-////////////////////////////////////////////////////////////////////////////////
-
-EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t tid) {
- PRINT0(LD_IO, "call kmpc_ordered\n");
-}
-
-EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t tid) {
- PRINT0(LD_IO, "call kmpc_end_ordered\n");
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP Barriers
-////////////////////////////////////////////////////////////////////////////////
-
-// a team is a block: we can use CUDA native synchronization mechanism
-// FIXME: what if not all threads (warps) participate to the barrier?
- // We may need to implement it differently
-
-EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc_ref, int32_t tid) {
- PRINT0(LD_IO, "call kmpc_cancel_barrier\n");
- __kmpc_barrier(loc_ref, tid);
- PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n");
- return 0;
-}
-
-EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
- if (isRuntimeUninitialized()) {
- ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
- "Expected SPMD mode with uninitialized runtime.");
- __kmpc_barrier_simple_spmd(loc_ref, tid);
- } else {
- tid = GetLogicalThreadIdInBlock();
- int numberOfActiveOMPThreads =
- GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
- if (numberOfActiveOMPThreads > 1) {
- if (__kmpc_is_spmd_exec_mode()) {
- __kmpc_barrier_simple_spmd(loc_ref, tid);
- } else {
- // The #threads parameter must be rounded up to the WARPSIZE.
- int threads =
- WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
-
- PRINT(LD_SYNC,
- "call kmpc_barrier with %d omp threads, sync parameter %d\n",
- (int)numberOfActiveOMPThreads, (int)threads);
- __kmpc_impl_named_sync(threads);
- }
- } else {
- // Still need to flush the memory per the standard.
- __kmpc_flush(loc_ref);
- } // numberOfActiveOMPThreads > 1
- PRINT0(LD_SYNC, "completed kmpc_barrier\n");
- }
-}
-
-// Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0
-// parallel region and that all worker threads participate.
-EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) {
- PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n");
- __kmpc_impl_syncthreads();
- PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n");
-}
-EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) {
- return __kmpc_barrier_simple_spmd(loc_ref, tid);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP MASTER
-////////////////////////////////////////////////////////////////////////////////
-
-EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid) {
- PRINT0(LD_IO, "call kmpc_master\n");
- return IsTeamMaster(global_tid);
-}
-
-EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) {
- PRINT0(LD_IO, "call kmpc_end_master\n");
- ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// KMP SINGLE
-////////////////////////////////////////////////////////////////////////////////
-
-EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid) {
- PRINT0(LD_IO, "call kmpc_single\n");
- // decide to implement single with master; master gets the single
- return IsTeamMaster(global_tid);
-}
-
-EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) {
- PRINT0(LD_IO, "call kmpc_end_single\n");
- // decide to implement single with master: master gets the single
- ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
- // sync barrier is explicitly called... so that is not a problem
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Flush
-////////////////////////////////////////////////////////////////////////////////
-
-EXTERN void __kmpc_flush(kmp_Ident *loc) {
- PRINT0(LD_IO, "call kmpc_flush\n");
- __kmpc_impl_threadfence();
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Vote
-////////////////////////////////////////////////////////////////////////////////
-
-EXTERN uint64_t __kmpc_warp_active_thread_mask(void) {
- PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");
- return __kmpc_impl_activemask();
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Syncwarp
-////////////////////////////////////////////////////////////////////////////////
-
-EXTERN void __kmpc_syncwarp(uint64_t Mask) {
- PRINT0(LD_IO, "call __kmpc_syncwarp\n");
- __kmpc_impl_syncwarp(Mask);
-}
-
-#pragma omp end declare target
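
In the removed __kmpc_barrier, the only arithmetic is rounding the active OpenMP thread count up to a warp multiple before issuing the named hardware barrier. A quick host-side check of that rounding (the barrier itself is device-only):

  #include <cassert>

  static constexpr int WARPSIZE = 32;

  // The named barrier must be given a warp-multiple thread count.
  int roundUpToWarp(int ActiveOmpThreads) {
    return WARPSIZE * ((ActiveOmpThreads + WARPSIZE - 1) / WARPSIZE);
  }

  int main() {
    assert(roundUpToWarp(1) == 32);
    assert(roundUpToWarp(33) == 64);
    assert(roundUpToWarp(64) == 64);
    return 0;
  }
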
diff --git a/openmp/libomptarget/deviceRTLs/common/src/task.cu b/openmp/libomptarget/deviceRTLs/common/src/task.cu
deleted file mode 100644
index 3c6020c5d6c23..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/src/task.cu
+++ /dev/null
@@ -1,219 +0,0 @@
-//===------------- task.h - NVPTX OpenMP tasks support ----------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Task implementation support.
-//
-// explicit task structure uses
-// omptarget_nvptx task
-// kmp_task
-//
-// where kmp_task is
-// - klegacy_TaskDescr <- task pointer
-// shared -> X
-// routine
-// part_id
-// descr
-// - private (of size given by task_alloc call). Accessed by
-// task+sizeof(klegacy_TaskDescr)
-// * private data *
-// - shared: X. Accessed by shared ptr in klegacy_TaskDescr
-// * pointer table to shared variables *
-// - end
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "common/omptarget.h"
-
-EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(
- kmp_Ident *loc, // unused
- uint32_t global_tid, // unused
- int32_t flag, // unused (because in our impl, all are immediately exec)
- size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable,
- kmp_TaskFctPtr taskSub) {
- PRINT(LD_IO,
- "call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, "
- "fct 0x%llx)\n",
- (long long)sizeOfTaskInclPrivate, (long long)sizeOfSharedTable,
- (unsigned long long)taskSub);
- // want task+priv to be a multiple of 8 bytes
- size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void *));
- sizeOfTaskInclPrivate += padForTaskInclPriv;
- size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable;
- ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0,
- "need task descr of size %d to be a multiple of %d\n",
- (int)sizeof(omptarget_nvptx_TaskDescr), (int)sizeof(void *));
- size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize;
- omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
- (omptarget_nvptx_ExplicitTaskDescr *)SafeMalloc(
- totSize, "explicit task descriptor");
- kmp_TaskDescr *newKmpTaskDescr = &newExplicitTaskDescr->kmpTaskDescr;
- ASSERT0(LT_FUSSY,
- (uint64_t)newKmpTaskDescr ==
- (uint64_t)ADD_BYTES(newExplicitTaskDescr,
- sizeof(omptarget_nvptx_TaskDescr)),
- "bad size assumptions");
- // init kmp_TaskDescr
- newKmpTaskDescr->sharedPointerTable =
- (void *)((char *)newKmpTaskDescr + sizeOfTaskInclPrivate);
- newKmpTaskDescr->sub = taskSub;
- newKmpTaskDescr->destructors = NULL;
- PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n",
- (unsigned long long)newKmpTaskDescr,
- (unsigned long long)newExplicitTaskDescr);
-
- return newKmpTaskDescr;
-}
-
-EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid,
- kmp_TaskDescr *newKmpTaskDescr) {
- return __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0,
- 0);
-}
-
-EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
- kmp_TaskDescr *newKmpTaskDescr,
- int32_t depNum, void *depList,
- int32_t noAliasDepNum,
- void *noAliasDepList) {
- PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n",
- P64(newKmpTaskDescr));
- ASSERT0(LT_FUSSY, isRuntimeInitialized(),
- "Runtime must be initialized.");
- // 1. get explicit task descr from kmp task descr
- omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
- (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
- newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
- ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
- "bad assumptions");
- omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
- ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
- "bad assumptions");
-
- // 2. push new context: update new task descriptor
- int tid = GetLogicalThreadIdInBlock();
- omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
- newTaskDescr->CopyForExplicitTask(parentTaskDescr);
- // set new task descriptor as top
- omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr);
-
- // 3. call sub
- PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n",
- (unsigned long long)newKmpTaskDescr->sub,
- (unsigned long long)newKmpTaskDescr);
- newKmpTaskDescr->sub(0, newKmpTaskDescr);
- PRINT(LD_TASK, "return from call task sub 0x%llx()\n",
- (unsigned long long)newKmpTaskDescr->sub);
-
- // 4. pop context
- omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
- parentTaskDescr);
- // 5. free
- SafeFree(newExplicitTaskDescr, "explicit task descriptor");
- return 0;
-}
-
-EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
- kmp_TaskDescr *newKmpTaskDescr) {
- PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n",
- (unsigned long long)newKmpTaskDescr);
- ASSERT0(LT_FUSSY, isRuntimeInitialized(),
- "Runtime must be initialized.");
- // 1. get explicit task descr from kmp task descr
- omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
- (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
- newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
- ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
- "bad assumptions");
- omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
- ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
- "bad assumptions");
-
- // 2. push new context: update new task descriptor
- int tid = GetLogicalThreadIdInBlock();
- omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
- newTaskDescr->CopyForExplicitTask(parentTaskDescr);
- // set new task descriptor as top
- omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr);
- // 3... nothing to call... is inline
- // 4 & 5 ... done in complete
-}
-
-EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
- kmp_TaskDescr *newKmpTaskDescr) {
- PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n",
- (unsigned long long)newKmpTaskDescr);
- ASSERT0(LT_FUSSY, isRuntimeInitialized(),
- "Runtime must be initialized.");
- // 1. get explicit task descr from kmp task descr
- omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
- (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
- newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
- ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
- "bad assumptions");
- omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
- ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
- "bad assumptions");
- // 2. get parent
- omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr();
- // 3... nothing to call... is inline
- // 4. pop context
- int tid = GetLogicalThreadIdInBlock();
- omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
- parentTaskDescr);
- // 5. free
- SafeFree(newExplicitTaskDescr, "explicit task descriptor");
-}
-
-EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid,
- int32_t depNum, void *depList,
- int32_t noAliasDepNum, void *noAliasDepList) {
- PRINT0(LD_IO, "call to __kmpc_omp_wait_deps(..)\n");
- // nothing to do as all our tasks are executed as final
-}
-
-EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid) {
- PRINT0(LD_IO, "call to __kmpc_taskgroup(..)\n");
- // nothing to do as all our tasks are executed as final
-}
-
-EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid) {
- PRINT0(LD_IO, "call to __kmpc_end_taskgroup(..)\n");
- // nothing to do as all our tasks are executed as final
-}
-
-EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid,
- int end_part) {
- PRINT0(LD_IO, "call to __kmpc_taskyield()\n");
- // do nothing: tasks are executed immediately, no yielding allowed
- return 0;
-}
-
-EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid) {
- PRINT0(LD_IO, "call to __kmpc_taskwait()\n");
- // nothing to do as all our tasks are executed as final
- return 0;
-}
-
-EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid,
- kmp_TaskDescr *newKmpTaskDescr, int if_val,
- uint64_t *lb, uint64_t *ub, int64_t st, int nogroup,
- int32_t sched, uint64_t grainsize, void *task_dup) {
-
- // skip task entirely if empty iteration space
- if (*lb > *ub)
- return;
-
- // the compiler has already stored lb and ub in the kmp_TaskDescr structure
- // as we are using a single task to execute the entire loop, we can leave
- // the initial task_t untouched
-
- __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0);
-}
-
-#pragma omp end declare target
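
The allocation in the removed __kmpc_omp_task_alloc lays out the implicit task descriptor, the kmp task plus private data, and the shared-pointer table back to back; the subtle step is padding the task+private region to a pointer multiple so the shared table starts aligned. A host-side sketch of that size computation, using a made-up 52-byte task size and 8-byte alignment:

  #include <cassert>
  #include <cstddef>

  // Same rule as the removed PadBytes(): bytes needed to bring Size up to
  // the next multiple of Alignment (a power of two).
  size_t padBytes(size_t Size, size_t Alignment) {
    return (~Size + 1) & (Alignment - 1);
  }

  int main() {
    size_t SizeOfTaskInclPrivate = 52; // hypothetical task + private size
    size_t Pad = padBytes(SizeOfTaskInclPrivate, 8);
    assert(Pad == 4);
    // The shared-pointer table starts right after the padded region.
    assert((SizeOfTaskInclPrivate + Pad) % 8 == 0);
    return 0;
  }
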
diff --git a/openmp/libomptarget/deviceRTLs/common/state-queue.h b/openmp/libomptarget/deviceRTLs/common/state-queue.h
deleted file mode 100644
index 8320929cfaf3a..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/state-queue.h
+++ /dev/null
@@ -1,51 +0,0 @@
-//===--------- statequeue.h - NVPTX OpenMP GPU State Queue ------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a queue to hand out OpenMP state objects to teams of
-// one or more kernels.
-//
-// Reference:
-// Thomas R.W. Scogland and Wu-chun Feng. 2015.
-// Design and Evaluation of Scalable Concurrent Queues for Many-Core
-// Architectures. International Conference on Performance Engineering.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __STATE_QUEUE_H
-#define __STATE_QUEUE_H
-
-#include <stdint.h>
-
-#include "target_impl.h"
-
-template <typename ElementType, uint32_t SIZE> class omptarget_nvptx_Queue {
-private:
- ElementType elements[SIZE];
- volatile ElementType *elementQueue[SIZE];
- volatile uint32_t head;
- volatile uint32_t ids[SIZE];
- volatile uint32_t tail;
-
- static const uint32_t MAX_ID = (1u << 31) / SIZE / 2;
- INLINE uint32_t ENQUEUE_TICKET();
- INLINE uint32_t DEQUEUE_TICKET();
- INLINE static uint32_t ID(uint32_t ticket);
- INLINE bool IsServing(uint32_t slot, uint32_t id);
- INLINE void PushElement(uint32_t slot, ElementType *element);
- INLINE ElementType *PopElement(uint32_t slot);
- INLINE void DoneServing(uint32_t slot, uint32_t id);
-
-public:
- INLINE omptarget_nvptx_Queue() {}
- INLINE void Enqueue(ElementType *element);
- INLINE ElementType *Dequeue();
-};
-
-#include "state-queuei.h"
-
-#endif
diff --git a/openmp/libomptarget/deviceRTLs/common/state-queuei.h b/openmp/libomptarget/deviceRTLs/common/state-queuei.h
deleted file mode 100644
index 50efb81a6d656..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/state-queuei.h
+++ /dev/null
@@ -1,88 +0,0 @@
-//===------- state-queuei.h - OpenMP GPU State Queue ------------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of a queue to hand out OpenMP state
-// objects to teams of one or more kernels.
-//
-// Reference:
-// Thomas R.W. Scogland and Wu-chun Feng. 2015.
-// Design and Evaluation of Scalable Concurrent Queues for Many-Core
-// Architectures. International Conference on Performance Engineering.
-//
-//===----------------------------------------------------------------------===//
-
-#include "state-queue.h"
-
-template <typename ElementType, uint32_t SIZE>
-INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ENQUEUE_TICKET() {
- return __kmpc_atomic_add((unsigned int *)&tail, 1u);
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::DEQUEUE_TICKET() {
- return __kmpc_atomic_add((unsigned int *)&head, 1u);
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
- return (ticket / SIZE) * 2;
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE bool omptarget_nvptx_Queue<ElementType, SIZE>::IsServing(uint32_t slot,
- uint32_t id) {
- return __kmpc_atomic_add((unsigned int *)&ids[slot], 0u) == id;
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE void
-omptarget_nvptx_Queue<ElementType, SIZE>::PushElement(uint32_t slot,
- ElementType *element) {
- __kmpc_atomic_exchange((unsigned long long *)&elementQueue[slot],
- (unsigned long long)element);
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE ElementType *
-omptarget_nvptx_Queue<ElementType, SIZE>::PopElement(uint32_t slot) {
- return (ElementType *)__kmpc_atomic_add(
- (unsigned long long *)&elementQueue[slot], (unsigned long long)0);
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE void omptarget_nvptx_Queue<ElementType, SIZE>::DoneServing(uint32_t slot,
- uint32_t id) {
- __kmpc_atomic_exchange((unsigned int *)&ids[slot], (id + 1) % MAX_ID);
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE void
-omptarget_nvptx_Queue<ElementType, SIZE>::Enqueue(ElementType *element) {
- uint32_t ticket = ENQUEUE_TICKET();
- uint32_t slot = ticket % SIZE;
- uint32_t id = ID(ticket) + 1;
- while (!IsServing(slot, id))
- ;
- PushElement(slot, element);
- DoneServing(slot, id);
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE ElementType *omptarget_nvptx_Queue<ElementType, SIZE>::Dequeue() {
- uint32_t ticket = DEQUEUE_TICKET();
- uint32_t slot = ticket % SIZE;
- uint32_t id = ID(ticket);
- while (!IsServing(slot, id))
- ;
- ElementType *element = PopElement(slot);
- // This is to populate the queue because of the lack of GPU constructors.
- if (element == 0)
- element = &elements[slot];
- DoneServing(slot, id);
- return element;
-}
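
The two state-queue headers removed here implement a ticket-based queue (after Scogland & Feng) that hands OpenMP state objects to teams: a ticket maps to slot = ticket % SIZE and generation id = (ticket / SIZE) * 2, dequeuers wait until the slot shows the even id, enqueuers the odd one, and DoneServing advances the slot to the next generation. Below is a single-threaded host C++ model of that bookkeeping; atomics, the spin-waits, and the MAX_ID wrap of the real code are deliberately left out, and the asserts stand in for the waits:

  #include <array>
  #include <cassert>
  #include <cstdint>

  template <typename T, uint32_t SIZE> struct TicketQueue {
    std::array<T, SIZE> Elements{};   // backing storage (no GPU constructors)
    std::array<T *, SIZE> Slots{};    // currently enqueued element per slot
    std::array<uint32_t, SIZE> Ids{}; // generation id per slot
    uint32_t Head = 0, Tail = 0;      // dequeue / enqueue ticket counters

    static uint32_t id(uint32_t Ticket) { return (Ticket / SIZE) * 2; }

    void enqueue(T *Element) {
      uint32_t Ticket = Tail++;
      uint32_t Slot = Ticket % SIZE;
      assert(Ids[Slot] == id(Ticket) + 1); // real code spins until served
      Slots[Slot] = Element;
      ++Ids[Slot]; // DoneServing: next generation may dequeue
    }

    T *dequeue() {
      uint32_t Ticket = Head++;
      uint32_t Slot = Ticket % SIZE;
      assert(Ids[Slot] == id(Ticket)); // real code spins until served
      T *Element = Slots[Slot];
      if (!Element) // first use: lazily hand out the slot's storage
        Element = &Elements[Slot];
      ++Ids[Slot]; // DoneServing: this generation may now enqueue
      return Element;
    }
  };

  int main() {
    TicketQueue<int, 1> Q;        // one slot is enough to show the id cycle
    int *State = Q.dequeue();     // hands out the slot's backing storage
    Q.enqueue(State);             // team done with the state, returns it
    assert(Q.dequeue() == State); // the next team gets the same object back
    return 0;
  }
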
diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h
deleted file mode 100644
index dfaf08339e894..0000000000000
--- a/openmp/libomptarget/deviceRTLs/common/support.h
+++ /dev/null
@@ -1,91 +0,0 @@
-//===--------- support.h - OpenMP GPU support functions ---------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Wrapper to some functions natively supported by the GPU.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_SUPPORT_H
-#define OMPTARGET_SUPPORT_H
-
-#include "interface.h"
-#include "target_impl.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// Execution Parameters
-////////////////////////////////////////////////////////////////////////////////
-enum OMPTgtExecModeFlags : int8_t {
- OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
- OMP_TGT_EXEC_MODE_SPMD = 1 << 1
-};
-
-enum OMPTgtRuntimeModeFlags : int8_t {
- OMP_TGT_RUNTIME_UNINITIALIZED = 0,
- OMP_TGT_RUNTIME_INITIALIZED = 1 << 2
-};
-
-void setExecutionParameters(OMPTgtExecModeFlags EMode,
- OMPTgtRuntimeModeFlags RMode);
-bool isGenericMode();
-bool isRuntimeUninitialized();
-bool isRuntimeInitialized();
-
-////////////////////////////////////////////////////////////////////////////////
-// get info from machine
-////////////////////////////////////////////////////////////////////////////////
-
-// get global ids to locate thread/team info (constant regardless of OMP)
-int GetLogicalThreadIdInBlock();
-int GetMasterThreadID();
-int GetNumberOfWorkersInTeam();
-
-// get OpenMP thread and team ids
-int GetOmpThreadId(); // omp_thread_num
-int GetOmpTeamId(); // omp_team_num
-
-// get OpenMP number of threads and team
-int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
-int GetNumberOfOmpTeams(); // omp_num_teams
-
-// get OpenMP number of procs
-int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
-int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
-
-// masters
-int IsTeamMaster(int ompThreadId);
-
-// Parallel level
-void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
-void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
-
-////////////////////////////////////////////////////////////////////////////////
-// Memory
-////////////////////////////////////////////////////////////////////////////////
-
-// safe alloc and free
-void *SafeMalloc(size_t size, const char *msg); // check if success
-void *SafeFree(void *ptr, const char *msg);
-// pad to an alignment (power of 2 only)
-unsigned long PadBytes(unsigned long size, unsigned long alignment);
-#define ADD_BYTES(_addr, _bytes) \
- ((void *)((char *)((void *)(_addr)) + (_bytes)))
-#define SUB_BYTES(_addr, _bytes) \
- ((void *)((char *)((void *)(_addr)) - (_bytes)))
-
-////////////////////////////////////////////////////////////////////////////////
-// Teams Reduction Scratchpad Helpers
-////////////////////////////////////////////////////////////////////////////////
-unsigned int *GetTeamsReductionTimestamp();
-char *GetTeamsReductionScratchpad();
-
-// Invoke an outlined parallel function unwrapping global, shared arguments (up
-// to 128).
-void __kmp_invoke_microtask(kmp_int32 global_tid, kmp_int32 bound_tid, void *fn,
- void **args, size_t nargs);
-
-#endif
diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h
deleted file mode 100644
index b9b537cf1d060..0000000000000
--- a/openmp/libomptarget/deviceRTLs/interface.h
+++ /dev/null
@@ -1,505 +0,0 @@
-//===------- interface.h - OpenMP interface definitions ---------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains all the definitions that are relevant to
-// the interface. The first section contains the interface as
-// declared by OpenMP. The second section includes the compiler
-// specific interfaces.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _INTERFACES_H_
-#define _INTERFACES_H_
-
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef __AMDGCN__
-#include "amdgcn/src/amdgcn_interface.h"
-#endif
-#ifdef __CUDACC__
-#include "nvptx/src/nvptx_interface.h"
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// OpenMP interface
-////////////////////////////////////////////////////////////////////////////////
-
-typedef uint64_t omp_nest_lock_t; /* arbitrary type of the right length */
-
-typedef enum omp_sched_t {
- omp_sched_static = 1, /* chunkSize >0 */
- omp_sched_dynamic = 2, /* chunkSize >0 */
- omp_sched_guided = 3, /* chunkSize >0 */
- omp_sched_auto = 4, /* no chunkSize */
-} omp_sched_t;
-
-typedef enum omp_proc_bind_t {
- omp_proc_bind_false = 0,
- omp_proc_bind_true = 1,
- omp_proc_bind_master = 2,
- omp_proc_bind_close = 3,
- omp_proc_bind_spread = 4
-} omp_proc_bind_t;
-
-EXTERN double omp_get_wtick(void);
-EXTERN double omp_get_wtime(void);
-
-EXTERN void omp_set_num_threads(int num);
-EXTERN int omp_get_num_threads(void);
-EXTERN int omp_get_max_threads(void);
-EXTERN int omp_get_thread_limit(void);
-EXTERN int omp_get_thread_num(void);
-EXTERN int omp_get_num_procs(void);
-EXTERN int omp_in_parallel(void);
-EXTERN int omp_in_final(void);
-EXTERN void omp_set_dynamic(int flag);
-EXTERN int omp_get_dynamic(void);
-EXTERN void omp_set_nested(int flag);
-EXTERN int omp_get_nested(void);
-EXTERN void omp_set_max_active_levels(int level);
-EXTERN int omp_get_max_active_levels(void);
-EXTERN int omp_get_level(void);
-EXTERN int omp_get_active_level(void);
-EXTERN int omp_get_ancestor_thread_num(int level);
-EXTERN int omp_get_team_size(int level);
-
-EXTERN void omp_init_lock(omp_lock_t *lock);
-EXTERN void omp_init_nest_lock(omp_nest_lock_t *lock);
-EXTERN void omp_destroy_lock(omp_lock_t *lock);
-EXTERN void omp_destroy_nest_lock(omp_nest_lock_t *lock);
-EXTERN void omp_set_lock(omp_lock_t *lock);
-EXTERN void omp_set_nest_lock(omp_nest_lock_t *lock);
-EXTERN void omp_unset_lock(omp_lock_t *lock);
-EXTERN void omp_unset_nest_lock(omp_nest_lock_t *lock);
-EXTERN int omp_test_lock(omp_lock_t *lock);
-EXTERN int omp_test_nest_lock(omp_nest_lock_t *lock);
-
-EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier);
-EXTERN void omp_set_schedule(omp_sched_t kind, int modifier);
-EXTERN omp_proc_bind_t omp_get_proc_bind(void);
-EXTERN int omp_get_cancellation(void);
-EXTERN void omp_set_default_device(int deviceId);
-EXTERN int omp_get_default_device(void);
-EXTERN int omp_get_num_devices(void);
-EXTERN int omp_get_num_teams(void);
-EXTERN int omp_get_team_num(void);
-EXTERN int omp_get_initial_device(void);
-EXTERN int omp_get_max_task_priority(void);
-
-EXTERN void *llvm_omp_get_dynamic_shared();
-
-////////////////////////////////////////////////////////////////////////////////
-// file below is swiped from kmpc host interface
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-// kmp specific types
-////////////////////////////////////////////////////////////////////////////////
-
-typedef enum kmp_sched_t {
- kmp_sched_static_chunk = 33,
- kmp_sched_static_nochunk = 34,
- kmp_sched_dynamic = 35,
- kmp_sched_guided = 36,
- kmp_sched_runtime = 37,
- kmp_sched_auto = 38,
-
- kmp_sched_static_balanced_chunk = 45,
-
- kmp_sched_static_ordered = 65,
- kmp_sched_static_nochunk_ordered = 66,
- kmp_sched_dynamic_ordered = 67,
- kmp_sched_guided_ordered = 68,
- kmp_sched_runtime_ordered = 69,
- kmp_sched_auto_ordered = 70,
-
- kmp_sched_distr_static_chunk = 91,
- kmp_sched_distr_static_nochunk = 92,
- kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
-
- kmp_sched_default = kmp_sched_static_nochunk,
- kmp_sched_unordered_first = kmp_sched_static_chunk,
- kmp_sched_unordered_last = kmp_sched_auto,
- kmp_sched_ordered_first = kmp_sched_static_ordered,
- kmp_sched_ordered_last = kmp_sched_auto_ordered,
- kmp_sched_distribute_first = kmp_sched_distr_static_chunk,
- kmp_sched_distribute_last =
- kmp_sched_distr_static_chunk_sched_static_chunkone,
-
- /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers.
- * Since we need to distinguish the three possible cases (no modifier,
- * monotonic modifier, nonmonotonic modifier), we need separate bits for
- * each modifier. The absence of monotonic does not imply nonmonotonic,
- * especially since 4.5 says that the behaviour of the "no modifier" case
- * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0.
- *
- * Since we're passing a full 32 bit value, we can use a couple of high
- * bits for these flags; out of paranoia we avoid the sign bit.
- *
- * These modifiers can be or-ed into non-static schedules by the compiler
- * to pass the additional information. They will be stripped early in the
- * processing in __kmp_dispatch_init when setting up schedules, so
- * most of the code won't ever see schedules with these bits set.
- */
- kmp_sched_modifier_monotonic = (1 << 29),
- /**< Set if the monotonic schedule modifier was present */
- kmp_sched_modifier_nonmonotonic = (1 << 30),
-/**< Set if the nonmonotonic schedule modifier was present */
-
-#define SCHEDULE_WITHOUT_MODIFIERS(s) \
- (enum kmp_sched_t)( \
- (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic))
-#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0)
-#define SCHEDULE_HAS_NONMONOTONIC(s) \
- (((s)&kmp_sched_modifier_nonmonotonic) != 0)
-#define SCHEDULE_HAS_NO_MODIFIERS(s) \
- (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \
- 0)
-
-} kmp_sched_t;
-
-/*!
- * Enum for accessing the reserved_2 field of the ident_t struct below.
- */
-enum {
- /*! Bit set to 1 when in SPMD mode. */
- KMP_IDENT_SPMD_MODE = 0x01,
- /*! Bit set to 1 when a simplified runtime is used. */
- KMP_IDENT_SIMPLE_RT_MODE = 0x02,
-};
-
-/*!
- * The ident structure that describes a source location.
- * The struct is identical to the one in the kmp.h file.
- * We maintain the same data structure for compatibility.
- */
-typedef short kmp_int16;
-typedef int kmp_int32;
-typedef struct ident {
- kmp_int32 reserved_1; /**< might be used in Fortran; see above */
- kmp_int32 flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC
- identifies this union member */
- kmp_int32 reserved_2; /**< not really used in Fortran any more; see above */
- kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++ */
- char const *psource; /**< String describing the source location.
- The string is composed of semi-colon separated fields
- which describe the source file, the function and a pair
- of line numbers that delimit the construct. */
-} ident_t;
-
-// parallel defs
-typedef ident_t kmp_Ident;
-typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num);
-typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id,
- int16_t lane_offset,
- int16_t shortCircuit);
-typedef void (*kmp_ListGlobalFctPtr)(void *buffer, int idx, void *reduce_data);
-
-// task defs
-typedef struct kmp_TaskDescr kmp_TaskDescr;
-typedef int32_t (*kmp_TaskFctPtr)(int32_t global_tid, kmp_TaskDescr *taskDescr);
-typedef struct kmp_TaskDescr {
- void *sharedPointerTable; // ptr to a table of shared var ptrs
- kmp_TaskFctPtr sub; // task subroutine
- int32_t partId; // unused
- kmp_TaskFctPtr destructors; // destructor of c++ first private
-} kmp_TaskDescr;
-
-// sync defs
-typedef int32_t kmp_CriticalName[8];
-
-////////////////////////////////////////////////////////////////////////////////
-// external interface
-////////////////////////////////////////////////////////////////////////////////
-
-// parallel
-EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc);
-NOINLINE EXTERN uint8_t __kmpc_parallel_level();
-
-// proc bind
-EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t global_tid,
- int proc_bind);
-EXTERN int omp_get_num_places(void);
-EXTERN int omp_get_place_num_procs(int place_num);
-EXTERN void omp_get_place_proc_ids(int place_num, int *ids);
-EXTERN int omp_get_place_num(void);
-EXTERN int omp_get_partition_num_places(void);
-EXTERN void omp_get_partition_place_nums(int *place_nums);
-
-// for static (no chunk or chunk)
-EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
- int32_t sched, int32_t *plastiter,
- int32_t *plower, int32_t *pupper,
- int32_t *pstride, int32_t incr,
- int32_t chunk);
-EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
- int32_t sched, int32_t *plastiter,
- uint32_t *plower, uint32_t *pupper,
- int32_t *pstride, int32_t incr,
- int32_t chunk);
-EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
- int32_t sched, int32_t *plastiter,
- int64_t *plower, int64_t *pupper,
- int64_t *pstride, int64_t incr,
- int64_t chunk);
-EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
- int32_t sched, int32_t *plastiter1,
- uint64_t *plower, uint64_t *pupper,
- int64_t *pstride, int64_t incr,
- int64_t chunk);
-// distribute static (no chunk or chunk)
-EXTERN void __kmpc_distribute_static_init_4(kmp_Ident *loc, int32_t global_tid,
- int32_t sched, int32_t *plastiter,
- int32_t *plower, int32_t *pupper,
- int32_t *pstride, int32_t incr,
- int32_t chunk);
-EXTERN void __kmpc_distribute_static_init_4u(kmp_Ident *loc, int32_t global_tid,
- int32_t sched, int32_t *plastiter,
- uint32_t *plower, uint32_t *pupper,
- int32_t *pstride, int32_t incr,
- int32_t chunk);
-EXTERN void __kmpc_distribute_static_init_8(kmp_Ident *loc, int32_t global_tid,
- int32_t sched, int32_t *plastiter,
- int64_t *plower, int64_t *pupper,
- int64_t *pstride, int64_t incr,
- int64_t chunk);
-EXTERN void __kmpc_distribute_static_init_8u(kmp_Ident *loc, int32_t global_tid,
- int32_t sched, int32_t *plastiter1,
- uint64_t *plower, uint64_t *pupper,
- int64_t *pstride, int64_t incr,
- int64_t chunk);
-EXTERN
-void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid,
- int32_t sched, int32_t *plastiter,
- int32_t *plower, int32_t *pupper,
- int32_t *pstride, int32_t incr,
- int32_t chunk);
-EXTERN
-void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
- int32_t sched, int32_t *plastiter,
- uint32_t *plower, uint32_t *pupper,
- int32_t *pstride, int32_t incr,
- int32_t chunk);
-EXTERN
-void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid,
- int32_t sched, int32_t *plastiter,
- int64_t *plower, int64_t *pupper,
- int64_t *pstride, int64_t incr,
- int64_t chunk);
-EXTERN
-void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
- int32_t sched, int32_t *plastiter1,
- uint64_t *plower, uint64_t *pupper,
- int64_t *pstride, int64_t incr,
- int64_t chunk);
-EXTERN
-void __kmpc_for_static_init_4_simple_generic(kmp_Ident *loc, int32_t global_tid,
- int32_t sched, int32_t *plastiter,
- int32_t *plower, int32_t *pupper,
- int32_t *pstride, int32_t incr,
- int32_t chunk);
-EXTERN
-void __kmpc_for_static_init_4u_simple_generic(
- kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter,
- uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
- int32_t chunk);
-EXTERN
-void __kmpc_for_static_init_8_simple_generic(kmp_Ident *loc, int32_t global_tid,
- int32_t sched, int32_t *plastiter,
- int64_t *plower, int64_t *pupper,
- int64_t *pstride, int64_t incr,
- int64_t chunk);
-EXTERN
-void __kmpc_for_static_init_8u_simple_generic(
- kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1,
- uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
- int64_t chunk);
-
-EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid);
-
-EXTERN void __kmpc_distribute_static_fini(kmp_Ident *loc, int32_t global_tid);
-
-// for dynamic
-EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t global_tid,
- int32_t sched, int32_t lower, int32_t upper,
- int32_t incr, int32_t chunk);
-EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t global_tid,
- int32_t sched, uint32_t lower,
- uint32_t upper, int32_t incr,
- int32_t chunk);
-EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t global_tid,
- int32_t sched, int64_t lower, int64_t upper,
- int64_t incr, int64_t chunk);
-EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t global_tid,
- int32_t sched, uint64_t lower,
- uint64_t upper, int64_t incr,
- int64_t chunk);
-
-EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t global_tid,
- int32_t *plastiter, int32_t *plower,
- int32_t *pupper, int32_t *pstride);
-EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t global_tid,
- int32_t *plastiter, uint32_t *plower,
- uint32_t *pupper, int32_t *pstride);
-EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t global_tid,
- int32_t *plastiter, int64_t *plower,
- int64_t *pupper, int64_t *pstride);
-EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t global_tid,
- int32_t *plastiter, uint64_t *plower,
- uint64_t *pupper, int64_t *pstride);
-
-EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t global_tid);
-EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid);
-EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid);
-EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid);
-
-// reduction
-EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid);
-EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
-EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
- kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
- void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
- kmp_InterWarpCopyFctPtr cpyFct);
-EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
- kmp_Ident *loc, int32_t global_tid, void *global_buffer,
- int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
- kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
- kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
- kmp_ListGlobalFctPtr glredFct);
-EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
-EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
-
-// sync barrier
-EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid);
-EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid);
-EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid);
-EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid);
-
-// single
-EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid);
-EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid);
-
-// sync
-EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid);
-EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid);
-EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t global_tid);
-EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t global_tid);
-EXTERN void __kmpc_critical(kmp_Ident *loc, int32_t global_tid,
- kmp_CriticalName *crit);
-EXTERN void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid,
- kmp_CriticalName *crit);
-EXTERN void __kmpc_flush(kmp_Ident *loc);
-
-// vote
-EXTERN uint64_t __kmpc_warp_active_thread_mask(void);
-// syncwarp
-EXTERN void __kmpc_syncwarp(uint64_t);
-
-// tasks
-EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Ident *loc, uint32_t global_tid,
- int32_t flag,
- size_t sizeOfTaskInclPrivate,
- size_t sizeOfSharedTable,
- kmp_TaskFctPtr sub);
-EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid,
- kmp_TaskDescr *newLegacyTaskDescr);
-EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
- kmp_TaskDescr *newLegacyTaskDescr,
- int32_t depNum, void *depList,
- int32_t noAliasDepNum,
- void *noAliasDepList);
-EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
- kmp_TaskDescr *newLegacyTaskDescr);
-EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
- kmp_TaskDescr *newLegacyTaskDescr);
-EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid,
- int32_t depNum, void *depList,
- int32_t noAliasDepNum, void *noAliasDepList);
-EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid);
-EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid);
-EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid,
- int end_part);
-EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid);
-EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid,
- kmp_TaskDescr *newKmpTaskDescr, int if_val,
- uint64_t *lb, uint64_t *ub, int64_t st, int nogroup,
- int32_t sched, uint64_t grainsize, void *task_dup);
-
-// cancel
-EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
- int32_t cancelVal);
-EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
- int32_t cancelVal);
-
-// non standard
-EXTERN int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode,
- bool UseGenericStateMachine,
- bool RequiresFullRuntime);
-EXTERN void __kmpc_target_deinit(ident_t *Ident, int8_t Mode,
- bool RequiresFullRuntime);
-EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
- int32_t NumThreadsClause);
-EXTERN bool __kmpc_kernel_parallel(void **WorkFn);
-EXTERN void __kmpc_kernel_end_parallel();
-
-EXTERN void __kmpc_data_sharing_init_stack();
-EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
-EXTERN void __kmpc_end_sharing_variables();
-EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs);
-
-/// Entry point to start a new parallel region.
-///
-/// \param ident The source identifier.
-/// \param global_tid The global thread ID.
-/// \param if_expr The if(expr), or 1 if none given.
-/// \param num_threads The num_threads(expr), or -1 if none given.
-/// \param proc_bind The proc_bind, or `proc_bind_default` if none given.
-/// \param fn The outlined parallel region function.
-/// \param wrapper_fn The worker wrapper function of fn.
-/// \param args The pointer array of arguments to fn.
-/// \param nargs The number of arguments to fn.
-NOINLINE EXTERN void __kmpc_parallel_51(ident_t *ident, kmp_int32 global_tid,
- kmp_int32 if_expr,
- kmp_int32 num_threads, int proc_bind,
- void *fn, void *wrapper_fn, void **args,
- size_t nargs);
-
-// SPMD execution mode interrogation function.
-EXTERN int8_t __kmpc_is_spmd_exec_mode();
-
-/// Return true if the hardware thread id \p Tid represents the OpenMP main
-/// thread in generic mode outside of a parallel region.
-EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid);
-
-/// Return true if the hardware thread id \p Tid represents the OpenMP main
-/// thread in generic mode.
-EXTERN int8_t __kmpc_is_generic_main_thread_id(kmp_int32 Tid);
-
-EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
- const void *buf, size_t size,
- int16_t is_shared, const void **res);
-
-EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
- int16_t is_shared);
-
-/// Allocate \p Bytes in "shareable" memory and return the address. Needs to be
-/// called balanced with __kmpc_free_shared like a stack (push/pop). Can be
-/// called by any thread, allocation happens per-thread.
-EXTERN void *__kmpc_alloc_shared(uint64_t Bytes);
-
-/// Deallocate \p Ptr. Needs to be called balanced with __kmpc_alloc_shared like
-/// a stack (push/pop). Can be called by any thread. \p Ptr must be allocated by
-/// __kmpc_alloc_shared by the same thread. \p Bytes contains the size of the
-/// paired allocation to make memory management easier.
-EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes);
-
-/// Get a pointer to the dynamic shared memory buffer in the device.
-EXTERN void *__kmpc_get_dynamic_shared();
-
-#endif
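
The push/pop discipline documented for __kmpc_alloc_shared and __kmpc_free_shared above is easiest to see from the caller's side. The sketch below is only an illustration written against the declarations in the removed interface.h; the outlined_target_region and escaping_use names and the two-double allocation are invented for the example and are not part of the runtime.

```
#include <stddef.h>
#include <stdint.h>

// Declarations copied from the (now removed) interface.h above.
extern "C" void *__kmpc_alloc_shared(uint64_t Bytes);
extern "C" void __kmpc_free_shared(void *Ptr, size_t Bytes);

// Hypothetical callee that makes the local variable escape the frame.
extern "C" void escaping_use(double *);

// Rough shape of what a compiler could emit when a local `double x[2]` must be
// visible to the threads of a parallel region: allocate it in shareable memory,
// use it, then free it with the same size, strictly last-in first-out.
extern "C" void outlined_target_region() {
  double *x = static_cast<double *>(__kmpc_alloc_shared(2 * sizeof(double)));
  escaping_use(x);                            // threads of the region read/write x
  __kmpc_free_shared(x, 2 * sizeof(double));  // balanced pop of the same allocation
}
```
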
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
deleted file mode 100644
index 9e5263e33a4a8..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ /dev/null
@@ -1,257 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build the NVPTX (CUDA) Device RTL if the CUDA tools are available
-#
-##===----------------------------------------------------------------------===##
-
-# By default we will not build NVPTX deviceRTL on a CUDA free system
-set(LIBOMPTARGET_BUILD_NVPTX_BCLIB FALSE CACHE BOOL
-  "Whether to build NVPTX deviceRTL on a CUDA free system.")
-
-if (NOT LIBOMPTARGET_BUILD_NVPTX_BCLIB)
- libomptarget_say("Not building NVPTX deviceRTL: Disabled by LIBOMPTARGET_BUILD_NVPTX_BCLIB")
- return()
-endif()
-
-if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
- libomptarget_say("Not building NVPTX device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
- return()
-endif()
-
-# Check if we can create an LLVM bitcode implementation of the runtime library
-# that could be inlined in the user application. For that we need to find
-# a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
-# an LLVM linker.
-set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING
- "Location of a CUDA compiler capable of emitting LLVM bitcode.")
-set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING
- "Location of a linker capable of linking LLVM bitcode objects.")
-
-if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "")
- set(cuda_compiler ${LIBOMPTARGET_NVPTX_CUDA_COMPILER})
-elseif (LLVM_TOOL_CLANG_BUILD AND NOT CMAKE_CROSSCOMPILING)
- # Compile the deviceRTL with the clang that is built in the project.
- set(cuda_compiler "$<TARGET_FILE:clang>")
-elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
- # Compile the device runtime with the compiler that OpenMP is built with.
- # This is the case with LLVM_ENABLE_RUNTIMES=openmp.
- # FIXME: This is unreliable; the compiler can be an older version of clang
- # that does not support compiling CUDA, or only an older version of it. The
- # risk is especially high on systems where clang is the default compiler
- # (MacOS, BSDs). LLVM_ENABLE_RUNTIMES=openmp should itself set
- # LIBOMPTARGET_NVPTX_CUDA_COMPILER instead.
- set(cuda_compiler ${CMAKE_C_COMPILER})
-else()
- libomptarget_say("Not building NVPTX deviceRTL: clang not found")
- return()
-endif()
-
-# Get compiler directory to try to locate a suitable linker.
-get_filename_component(compiler_dir ${cuda_compiler} DIRECTORY)
-set(llvm_link "${compiler_dir}/llvm-link")
-
-if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "")
- set(bc_linker ${LIBOMPTARGET_NVPTX_BC_LINKER})
-elseif (EXISTS ${llvm_link})
- # Try to use the linker consistent with the CUDA compiler unless explicitly
- # set to a different linker.
- set(bc_linker ${llvm_link})
-elseif (NOT OPENMP_STANDALONE_BUILD AND NOT CMAKE_CROSSCOMPILING)
- # Use the linker also built in the same project.
- set(bc_linker "$<TARGET_FILE:llvm-link>")
-else()
- libomptarget_say("Not building NVPTX deviceRTL: llvm-link not found")
- return()
-endif()
-
-# TODO: This part needs to be refined when libomptarget is going to support
-# Windows!
-# TODO: This part can also be removed if we can change the clang driver to make
-# it support device only compilation.
-if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
- set(aux_triple x86_64-unknown-linux-gnu)
-elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le")
- set(aux_triple powerpc64le-unknown-linux-gnu)
-elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
- set(aux_triple aarch64-unknown-linux-gnu)
-else()
- libomptarget_say("Not building CUDA offloading device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
- return()
-endif()
-
-get_filename_component(devicertl_base_directory
- ${CMAKE_CURRENT_SOURCE_DIR}
- DIRECTORY)
-set(devicertl_common_directory
- ${devicertl_base_directory}/common)
-set(devicertl_nvptx_directory
- ${devicertl_base_directory}/nvptx)
-
-set(all_capabilities 35 37 50 52 53 60 61 62 70 72 75 80 86)
-
-set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${all_capabilities} CACHE STRING
- "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.")
-string(TOLOWER ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES} LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES)
-
-if (LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "all")
- set(nvptx_sm_list ${all_capabilities})
-elseif(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "auto")
- if (NOT LIBOMPTARGET_DEP_CUDA_FOUND)
- libomptarget_error_say("[NVPTX] Cannot auto detect compute capability as CUDA not found.")
- endif()
- set(nvptx_sm_list ${LIBOMPTARGET_DEP_CUDA_ARCH})
-else()
- string(REPLACE "," ";" nvptx_sm_list "${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}")
-endif()
-
-# If user set LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES to empty, we disable the
-# build.
-if (NOT nvptx_sm_list)
- libomptarget_say("Not building CUDA offloading device RTL: empty compute capability list")
- return()
-endif()
-
-# Check all SM values
-foreach(sm ${nvptx_sm_list})
- if (NOT ${sm} IN_LIST all_capabilities)
- libomptarget_warning_say("[NVPTX] Compute capability ${sm} is not supported. Make sure clang can work with it.")
- endif()
-endforeach()
-
-# Override default MAX_SM in src/target_impl.h if requested
-if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM)
- set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}")
-endif()
-
-# Activate RTL message dumps if requested by the user.
-set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL
- "Activate NVPTX device RTL debug messages.")
-
-if ("${cuda_compiler}" STREQUAL "$<TARGET_FILE:clang>")
- libomptarget_say("Building CUDA LLVM bitcode offloading device RTL using in-tree clang.")
-else ()
- libomptarget_say("Building CUDA LLVM bitcode offloading device RTL using ${cuda_compiler}")
-endif ()
-
-set(cuda_src_files
- ${devicertl_common_directory}/src/cancel.cu
- ${devicertl_common_directory}/src/critical.cu
- ${devicertl_common_directory}/src/data_sharing.cu
- ${devicertl_common_directory}/src/libcall.cu
- ${devicertl_common_directory}/src/loop.cu
- ${devicertl_common_directory}/src/omp_data.cu
- ${devicertl_common_directory}/src/omptarget.cu
- ${devicertl_common_directory}/src/parallel.cu
- ${devicertl_common_directory}/src/reduction.cu
- ${devicertl_common_directory}/src/support.cu
- ${devicertl_common_directory}/src/sync.cu
- ${devicertl_common_directory}/src/task.cu
- ${devicertl_common_directory}/src/shuffle.cpp
- src/target_impl.cu
-)
-
-# Prepend -I to each list element
-set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
-list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX PREPEND "-I")
-
-# Set flags for LLVM Bitcode compilation.
-set(bc_flags -S -x c++ -O1 -std=c++14
- -mllvm -openmp-opt-disable
- -ffreestanding
- -target nvptx64
- -fvisibility=hidden
- -Xclang -emit-llvm-bc
- -Xclang -aux-triple -Xclang ${aux_triple}
- -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
- -Xclang -target-feature -Xclang +ptx61
- -D__CUDACC__
- -I${devicertl_base_directory}
- -I${devicertl_common_directory}/include
- -I${devicertl_nvptx_directory}/src
- -I${devicertl_base_directory}/../include
- ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX})
-
-if(${LIBOMPTARGET_NVPTX_DEBUG})
- list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1 -g)
-else()
- list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=0)
-endif()
-
-# Create target to build all Bitcode libraries.
-add_custom_target(omptarget-nvptx-bc)
-
-# Generate a Bitcode library for all the compute capabilities the user requested
-foreach(sm ${nvptx_sm_list})
- set(cuda_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
- set(bc_files "")
- foreach(src ${cuda_src_files})
- get_filename_component(infile ${src} ABSOLUTE)
- get_filename_component(outfile ${src} NAME)
- set(outfile "${outfile}-sm_${sm}.bc")
-
- add_custom_command(OUTPUT ${outfile}
- COMMAND ${cuda_compiler} ${bc_flags}
- ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
- DEPENDS ${infile}
- IMPLICIT_DEPENDS CXX ${infile}
- COMMENT "Building LLVM bitcode ${outfile}"
- VERBATIM
- )
- if("${cuda_compiler}" STREQUAL "$<TARGET_FILE:clang>")
- # Add a file-level dependency to ensure that clang is up-to-date.
- # By default, add_custom_command only builds clang if the
- # executable is missing.
- add_custom_command(OUTPUT ${outfile}
- DEPENDS clang
- APPEND
- )
- endif()
- set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
-
- list(APPEND bc_files ${outfile})
- endforeach()
-
- set(bclib_name "libomptarget-nvptx-sm_${sm}.bc")
-
- # Link to a bitcode library.
- add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
- COMMAND ${bc_linker}
- -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
- DEPENDS ${bc_files}
- COMMENT "Linking LLVM bitcode ${bclib_name}"
- )
- if("${bc_linker}" STREQUAL "$<TARGET_FILE:llvm-link>")
- # Add a file-level dependency to ensure that llvm-link is up-to-date.
- # By default, add_custom_command only builds llvm-link if the
- # executable is missing.
- add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
- DEPENDS llvm-link
- APPEND
- )
- endif()
- set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
-
- set(bclib_target_name "omptarget-nvptx-sm_${sm}-bc")
-
- add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
- add_dependencies(omptarget-nvptx-bc ${bclib_target_name})
-
- # Copy library to destination.
- add_custom_command(TARGET ${bclib_target_name} POST_BUILD
- COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
- ${LIBOMPTARGET_LIBRARY_DIR})
-
- # Install bitcode library under the lib destination folder.
- install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-endforeach()
-
-# Test will be enabled if the building machine supports CUDA
-if (LIBOMPTARGET_DEP_CUDA_FOUND)
- add_subdirectory(test)
-endif()
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt b/openmp/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt
deleted file mode 100644
index 4149dfacb62ad..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt
+++ /dev/null
@@ -1,523 +0,0 @@
-
-**Design document for OpenMP reductions on the GPU**
-
-//Abstract: //In this document we summarize the new design for an OpenMP
-implementation of reductions on NVIDIA GPUs. This document comprises
-* a succinct background review,
-* an introduction to the decoupling of reduction algorithm and
- data-structure-specific processing routines,
-* detailed illustrations of reduction algorithms used and
-* a brief overview of steps we have made beyond the last implementation.
-
-**Problem Review**
-
-Consider a typical OpenMP program with reduction pragma.
-
-```
- double foo, bar;
- #pragma omp parallel for reduction(+:foo, bar)
- for (int i = 0; i < N; i++) {
- foo+=A[i]; bar+=B[i];
- }
-```
-where 'foo' and 'bar' are reduced across all threads in the parallel region.
-Our primary goal is to efficiently aggregate the values of foo and bar in
-such a manner that
-* makes the compiler logically concise.
-* efficiently reduces within warps, threads, blocks and the device.
-
-**Introduction to Decoupling**
-In this section we address the problem of making the compiler
-//logically concise// by partitioning the task of reduction into two broad
-categories: data-structure specific routines and algorithmic routines.
-
-The previous reduction implementation was highly coupled with
-the specificity of the reduction element data structures (e.g., sizes, data
-types) and operators of the reduction (e.g., addition, multiplication). In
-our implementation we strive to decouple them. In our final implementations,
-we could remove all template functions in our runtime system.
-
-The (simplified) pseudo code generated by LLVM is as follows:
-
-```
- 1. Create private copies of variables: foo_p, bar_p
- 2. Each thread reduces the chunk of A and B assigned to it and writes
- to foo_p and bar_p respectively.
- 3. ret = kmpc_nvptx_reduce_nowait(..., reduceData, shuffleReduceFn,
- interWarpCpyFn)
- where:
- struct ReduceData {
- double *foo;
- double *bar;
- } reduceData
- reduceData.foo = &foo_p
- reduceData.bar = &bar_p
-
- shuffleReduceFn and interWarpCpyFn are two auxiliary functions
- generated to aid the runtime performing algorithmic steps
- while being data-structure agnostic about ReduceData.
-
- In particular, shuffleReduceFn is a function that takes the following
- inputs:
- a. local copy of ReduceData
- b. its lane_id
- c. the offset of the lane_id which hosts a remote ReduceData
- relative to the current one
- d. an algorithm version parameter determining which reduction
- algorithm to use.
- This shuffleReduceFn retrieves the remote ReduceData through shuffle
- intrinsics and reduces, using the algorithm specified by the 4th
- parameter, the local ReduceData with the remote ReduceData element-wise,
- and places the resultant values into the local ReduceData.
-
- Different reduction algorithms are implemented with different runtime
- functions, but they all make calls to this same shuffleReduceFn to
- perform the essential reduction step. Therefore, based on the 4th
- parameter, this shuffleReduceFn will behave slightly differently to
- cooperate with the runtime function to ensure correctness under
- different circumstances.
-
- InterWarpCpyFn, as the name suggests, is a function that copies data
- across warps. Its function is to tunnel all the thread private
- ReduceData that is already reduced within a warp to a lane in the first
- warp with minimal shared memory footprint. This is an essential step to
- prepare for the last step of a block reduction.
-
- (Warp, block, device level reduction routines that utilize these
- auxiliary functions will be discussed in the next section.)
-
- 4. if ret == 1:
- The master thread stores the reduced result in the globals.
- foo += reduceData.foo; bar += reduceData.bar
-```
-
-**Reduction Algorithms**
-
-On the warp level, we have three versions of the algorithms:
-
-1. Full Warp Reduction
-
-```
-gpu_regular_warp_reduce(void *reduce_data,
- kmp_ShuffleReductFctPtr ShuffleReduceFn) {
- for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
- ShuffleReduceFn(reduce_data, 0, offset, 0);
-}
-```
-ShuffleReduceFn is used here with lane_id set to 0 because it is not used;
-we therefore save instructions by not retrieving lane_id from the corresponding
-special registers. The 4th parameter, which represents the version of the
-algorithm being used here, is set to 0 to signify full warp reduction.
-
-In this version specified (=0), the ShuffleReduceFn behaves, per element, as
-follows:
-
-```
-//reduce_elem refers to an element in the local ReduceData
-//remote_elem is retrieved from a remote lane
-remote_elem = shuffle_down(reduce_elem, offset, 32);
-reduce_elem = reduce_elem @ remote_elem;
-
-```
-
-An illustration of this algorithm operating on a hypothetical 8-lane full-warp
-would be:
-{F74}
-The coloring invariant is that elements with the same color will be
-combined and reduced in the next reduction step. As can be observed, no overhead
-is present; exactly log(2, N) steps are needed.
-
-2. Contiguous Full Warp Reduction
-```
-gpu_irregular_warp_reduce(void *reduce_data,
- kmp_ShuffleReductFctPtr ShuffleReduceFn, int size,
- int lane_id) {
- int curr_size;
- int offset;
- curr_size = size;
- offset = curr_size/2;
- while (offset>0) {
- ShuffleReduceFn(reduce_data, lane_id, offset, 1);
- curr_size = (curr_size+1)/2;
- offset = curr_size/2;
- }
-}
-```
-
-In this version specified (=1), the ShuffleReduceFn behaves, per element, as
-follows:
-```
-//reduce_elem refers to an element in the local ReduceData
-//remote_elem is retrieved from a remote lane
-remote_elem = shuffle_down(reduce_elem, offset, 32);
-if (lane_id < offset) {
- reduce_elem = reduce_elem @ remote_elem
-} else {
- reduce_elem = remote_elem
-}
-```
-
-An important invariant (also a restriction on the starting state of the
-reduction) is that this algorithm assumes that all unused ReduceData are
-located in a contiguous subset of threads in a warp starting from lane 0.
-
-With the presence of a trailing active lane with an odd-numbered lane
-id, its value will not be aggregated with any other lane. Therefore,
-in order to preserve the invariant, such ReduceData is copied to the first lane
-whose thread-local ReduceData has already been used in a previous reduction
-and would therefore be useless otherwise.
-
-An illustration of this algorithm operating on a hypothetical 8-lane partial
-warp would be:
-{F75}
-
-As illustrated, this version of the algorithm introduces overhead whenever
-we have an odd number of participating lanes in any reduction step to
-copy data between lanes.
-
-3. Dispersed Partial Warp Reduction
-```
-gpu_irregular_simt_reduce(void *reduce_data,
- kmp_ShuffleReductFctPtr ShuffleReduceFn) {
- int size, remote_id;
- int logical_lane_id = find_number_of_dispersed_active_lanes_before_me() * 2;
- do {
- remote_id = find_the_next_active_lane_id_right_after_me();
- // the above function returns 0 if no active lane
- // is present right after the current thread.
- size = get_number_of_active_lanes_in_this_warp();
- logical_lane_id /= 2;
- ShuffleReduceFn(reduce_data, logical_lane_id, remote_id-1-threadIdx.x, 2);
- } while (logical_lane_id % 2 == 0 && size > 1);
-}
-```
-
-There is no assumption made about the initial state of the reduction.
-Any number of lanes (>=1) could be active at any position. The reduction
-result is kept in the first active lane.
-
-In this version specified (=2), the ShuffleReduceFn behaves, per element, as
-follows:
-```
-//reduce_elem refers to an element in the local ReduceData
-//remote_elem is retrieved from a remote lane
-remote_elem = shuffle_down(reduce_elem, offset, 32);
-if (lane_id % 2 == 0 && offset > 0) {
- reduce_elem = reduce_elem @ remote_elem
-} else {
- reduce_elem = remote_elem
-}
-```
-We will proceed with a brief explanation of some of the arguments passed in.
-In this section we introduce the concept of logical_lane_id; it is important
-to distinguish it from the physical lane_id as defined by nvidia.
-1. //logical_lane_id//: as the name suggests, it refers to the calculated
- lane_id (instead of the physical one defined by nvidia) that would make
- our algorithm logically concise. A thread with logical_lane_id k means
- there are (k-1) threads before it.
-2. //remote_id-1-threadIdx.x//: remote_id is indeed the nvidia-defined lane
- id of the remote lane from which we will retrieve the ReduceData. We
- subtract (threadIdx+1) from it because we would like to maintain only one
- underlying shuffle intrinsic (which is used to communicate among lanes in a
- warp). The particular shuffle intrinsic we use accepts only
- offsets, instead of absolute lane_ids. Therefore the subtraction is performed
- on the absolute lane_id we calculated to obtain the offset.
-
-This algorithm is slightly different in 2 ways and it is not, conceptually, a
-generalization of the above algorithms.
-1. It reduces elements close to each other. For instance, values in the 0th lane
- are to be combined with those of the 1st lane; values in the 2nd lane are to
- be combined with those of the 3rd lane. We did not use the previous algorithm
- where the first half of the (partial) warp is reduced with the second half
- of the (partial) warp. This is because, the mapping
- f(x): logical_lane_id -> physical_lane_id;
- can be easily calculated whereas its inverse
- f^-1(x): physical_lane_id -> logical_lane_id
- cannot and performing such reduction requires the inverse to be known.
-2. Because this algorithm is agnostic about the positions of the lanes that are
- active, we do not need to perform the copying step as in the second
- algorithm.
-An illustrative run would look like
-{F76}
-As observed, overhead is high because in each and every step of reduction,
-logical_lane_id is recalculated; so is the remote_id.
-
-On a block level, we have implemented the following block reduce algorithm:
-
-```
-gpu_irregular_block_reduce(void *reduce_data,
- kmp_ShuffleReductFctPtr shuflReduceFn,
- kmp_InterWarpCopyFctPtr interWarpCpyFn,
- int size) {
-
- int wid = threadIdx.x/WARPSIZE;
- int lane_id = threadIdx.x%WARPSIZE;
-
- int warp_needed = (size+WARPSIZE-1)/WARPSIZE; //ceiling of division
-
- unsigned tnum = __ballot(1);
- int thread_num = __popc(tnum);
-
- //full warp reduction
- if (thread_num == WARPSIZE) {
- gpu_regular_warp_reduce(reduce_data, shuflReduceFn);
- }
- //partial warp reduction
- if (thread_num < WARPSIZE) {
- gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, thread_num,
- lane_id);
- }
- //Gather all the reduced values from each warp
- //to the first warp
- //named_barrier inside this function to ensure
- //correctness. It is effectively a sync_thread
- //that won't deadlock.
- interWarpCpyFn(reduce_data, warp_needed);
-
- //This is to reduce data gathered from each "warp master".
- if (wid==0) {
- gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, warp_needed,
- lane_id);
- }
-
- return;
-}
-```
-In this function, no ShuffleReduceFn is directly called as it makes calls
-to various versions of the warp-reduction functions. It first reduces
-ReduceData warp by warp; in the end, we end up with the number of
-ReduceData equal to the number of warps present in this thread
-block. We then proceed to gather all such ReduceData to the first warp.
-
-As observed, in this algorithm we make use of the function InterWarpCpyFn,
-which copies data from each of the "warp masters" (the 0th lane of each warp, where
-a warp-reduced ReduceData is held) to the 0th warp. This step reduces (in a
-mathematical sense) the problem of reduction across warp masters in a block to
-the problem of warp reduction which we already have solutions to.
-
-We can thus completely avoid the use of atomics to reduce in a threadblock.
-
-**Efficient Cross Block Reduce**
-
-The next challenge is to reduce values across threadblocks. We aim to do this
-without atomics or critical sections.
-
-Let a kernel be started with TB threadblocks.
-Let the GPU have S SMs.
-There can be at most N active threadblocks per SM at any time.
-
-Consider a threadblock tb (tb < TB) running on SM s (s < S). 'tb' is one of
-at most 'N' active threadblocks on SM s. Let each threadblock active on an SM
-be given an instance identifier id (0 <= id < N). Therefore, the tuple (s, id)
-uniquely identifies an active threadblock on the GPU.
-
-To efficiently implement cross block reduce, we first allocate an array for
-each value to be reduced of size S*N (which is the maximum number of active
-threadblocks at any time on the device).
-
-Each threadblock reduces its value to slot [s][id]. This can be done without
-locking since no other threadblock can write to the same slot concurrently.
-
-As a final stage, we reduce the values in the array as follows:
-
-```
-// Compiler generated wrapper function for each target region with a reduction
-// clause.
-target_function_wrapper(map_args, reduction_array) <--- start with 1 team and 1
- thread.
- // Use dynamic parallelism to launch M teams, N threads as requested by the
- // user to execute the target region.
-
- target_function<<M, N>>(map_args)
-
- Reduce values in reduction_array
-
-```
-
-**Comparison with Last Version**
-
-
-The (simplified) pseudo code generated by LLVM on the host is as follows:
-
-
-```
- 1. Create private copies of variables: foo_p, bar_p
- 2. Each thread reduces the chunk of A and B assigned to it and writes
- to foo_p and bar_p respectively.
- 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock)
- where:
- struct ReduceData {
- double *foo;
- double *bar;
- } reduceData
- reduceData.foo = &foo_p
- reduceData.bar = &bar_p
-
- reduceFn is a pointer to a function that takes in two inputs
- of type ReduceData, "reduces" them element wise, and places the
- result in the first input:
- reduceFn(ReduceData *a, ReduceData *b)
- a = a @ b
-
- Every thread in the parallel region calls kmpc_reduce_nowait with
- its private copy of reduceData. The runtime reduces across the
- threads (using tree reduction on the operator 'reduceFn') and stores
- the final result in the master thread if successful.
- 4. if ret == 1:
- The master thread stores the reduced result in the globals.
- foo += reduceData.foo; bar += reduceData.bar
- 5. else if ret == 2:
- In this case kmpc_reduce_nowait() could not use tree reduction,
- so use atomics instead:
- each thread atomically writes to foo
- each thread atomically writes to bar
-```
-
-On a GPU, a similar reduction may need to be performed across SIMT threads,
-warps, and threadblocks. The challenge is to do so efficiently in a fashion
-that is compatible with the LLVM OpenMP implementation.
-
-In the previously released 0.1 version of the LLVM OpenMP compiler for GPUs,
-the salient steps of the code generated are as follows:
-
-
-```
- 1. Create private copies of variables: foo_p, bar_p
- 2. Each thread reduces the chunk of A and B assigned to it and writes
- to foo_p and bar_p respectively.
- 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock)
- status = can_block_reduce()
- if status == 1:
- reduce efficiently to thread 0 using shuffles and shared memory.
- return 1
- else
- cannot use efficient block reduction, fallback to atomics
- return 2
- 4. if ret == 1:
- The master thread stores the reduced result in the globals.
- foo += reduceData.foo; bar += reduceData.bar
- 5. else if ret == 2:
- In this case kmpc_reduce_nowait() could not use tree reduction,
- so use atomics instead:
- each thread atomically writes to foo
- each thread atomically writes to bar
-```
-
-The function can_block_reduce() is defined as follows:
-
-
-```
-int32_t can_block_reduce() {
- int tid = GetThreadIdInTeam();
- int nt = GetNumberOfOmpThreads(tid);
- if (nt != blockDim.x)
- return 0;
- unsigned tnum = __ballot(1);
- if (tnum != (~0x0)) {
- return 0;
- }
- return 1;
-}
-```
-
-This function permits the use of the efficient block reduction algorithm
-using shuffles and shared memory (return 1) only if (a) all SIMT threads in
-a warp are active (i.e., number of threads in the parallel region is a
-multiple of 32) and (b) the number of threads in the parallel region
-(set by the num_threads clause) equals blockDim.x.
-
-If either of these preconditions is not true, each thread in the threadblock
-updates the global value using atomics.
-
-Atomics and compare-and-swap operations are expensive on many threaded
-architectures such as GPUs and we must avoid them completely.
-
-
-**Appendix: Implementation Details**
-
-
-```
-// Compiler generated function.
-reduceFn(ReduceData *a, ReduceData *b)
- a->foo = a->foo + b->foo
- a->bar = a->bar + b->bar
-
-// Compiler generated function.
-swapAndReduceFn(ReduceData *thread_private, int lane)
- ReduceData *remote = new ReduceData()
- remote->foo = shuffle_double(thread_private->foo, lane)
- remote->bar = shuffle_double(thread_private->bar, lane)
- reduceFn(thread_private, remote)
-
-// OMP runtime function.
-warpReduce_regular(ReduceData *thread_private, Fn *swapAndReduceFn):
- offset = 16
- while (offset > 0)
- swapAndReduceFn(thread_private, offset)
- offset /= 2
-
-// OMP runtime function.
-warpReduce_irregular():
- ...
-
-// OMP runtime function.
-kmpc_reduce_warp(reduceData, swapAndReduceFn)
- if all_lanes_active:
- warpReduce_regular(reduceData, swapAndReduceFn)
- else:
- warpReduce_irregular(reduceData, swapAndReduceFn)
- if in_simd_region:
- // all done, reduce to global in simd lane 0
- return 1
- else if in_parallel_region:
- // done reducing to one value per warp, now reduce across warps
- return 3
-
-// OMP runtime function; one for each basic type.
-kmpc_reduce_block_double(double *a)
- if lane == 0:
- shared[wid] = *a
- named_barrier(1, num_threads)
- if wid == 0
- block_reduce(shared)
- if lane == 0
- *a = shared[0]
- named_barrier(1, num_threads)
- if wid == 0 and lane == 0
- return 1 // write back reduced result
- else
- return 0 // don't do anything
-
-```
-
-
-
-```
-// Compiler generated code.
- 1. Create private copies of variables: foo_p, bar_p
- 2. Each thread reduces the chunk of A and B assigned to it and writes
- to foo_p and bar_p respectively.
- 3. ret = kmpc_reduce_warp(reduceData, swapAndReduceFn)
- 4. if ret == 1:
- The master thread stores the reduced result in the globals.
- foo += reduceData.foo; bar += reduceData.bar
- 5. else if ret == 3:
- ret = block_reduce_double(reduceData.foo)
- if ret == 1:
- foo += reduceData.foo
- ret = block_reduce_double(reduceData.bar)
- if ret == 1:
- bar += reduceData.bar
-```
-
-**Notes**
-
- 1. This scheme requires that the CUDA OMP runtime can call llvm generated
- functions. This functionality now works.
- 2. If the user inlines the CUDA OMP runtime bitcode, all of the machinery
- (including calls through function pointers) is optimized away.
- 3. If we are reducing multiple variables in a parallel region,
- the reduce operations are all performed in warpReduce_[ir]regular(). This
- results in more instructions in the loop and should result in fewer
- stalls due to data dependencies. Unfortunately we cannot do the same in
- kmpc_reduce_block_double() without increasing shared memory usage.
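
As a concrete, purely illustrative companion to the shuffleReduceFn contract described in the design document above, the sketch below shows one way the compiler-generated helper for the reduction(+:foo, bar) example could look, written against the kmp_ShuffleReductFctPtr signature and the __kmpc_shuffle_int64 entry point from the removed interface.h. It simplifies ReduceData to hold values rather than pointers, implements only the version-0 (full-warp) behaviour, and all helper names are invented.

```
#include <cstdint>
#include <cstring>

// Entry point declared in the removed interface.h.
extern "C" int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);

struct ReduceData { double foo; double bar; }; // simplified: values, not pointers

// Fetch a double from the lane `offset` positions away via the integer shuffle.
static double shuffle_double(double val, int16_t offset) {
  int64_t bits;
  std::memcpy(&bits, &val, sizeof(bits));
  bits = __kmpc_shuffle_int64(bits, offset, /*size=*/32);
  std::memcpy(&val, &bits, sizeof(val));
  return val;
}

// Matches kmp_ShuffleReductFctPtr: (void *rhsData, lane_id, lane_offset, version).
// Version 0 = full-warp reduction: unconditionally fold the remote element in.
extern "C" void shuffle_reduce_fn(void *rhsData, int16_t lane_id,
                                  int16_t lane_offset, int16_t version) {
  ReduceData *local = static_cast<ReduceData *>(rhsData);
  double remote_foo = shuffle_double(local->foo, lane_offset);
  double remote_bar = shuffle_double(local->bar, lane_offset);
  if (version == 0) { // versions 1 and 2 would branch on lane_id as described above
    local->foo += remote_foo;
    local->bar += remote_bar;
  }
  (void)lane_id;
}
```

The interWarpCpyFn counterpart would similarly move each warp's partial result through shared memory to the first warp, as the design text describes.
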
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
deleted file mode 100644
index b8fd91edcd8f7..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
+++ /dev/null
@@ -1,17 +0,0 @@
-//===--- nvptx_interface.h - OpenMP interface definitions -------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _NVPTX_INTERFACE_H_
-#define _NVPTX_INTERFACE_H_
-
-#include <stdint.h>
-
-#define EXTERN extern "C"
-typedef uint32_t omp_lock_t; /* arbitrary type of the right length */
-
-#endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
deleted file mode 100644
index c442596933b36..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ /dev/null
@@ -1,198 +0,0 @@
-//===---------- target_impl.cu - NVPTX OpenMP GPU options ------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Definitions of target specific functions
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "common/debug.h"
-#include "target_impl.h"
-#include "target_interface.h"
-
-EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
- asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
-}
-
-EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
- uint64_t val;
- asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
- return val;
-}
-
-EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
- __kmpc_impl_lanemask_t res;
- asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
- return res;
-}
-
-EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
- __kmpc_impl_lanemask_t res;
- asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
- return res;
-}
-
-EXTERN uint32_t __kmpc_impl_smid() {
- uint32_t id;
- asm("mov.u32 %0, %%smid;" : "=r"(id));
- return id;
-}
-
-EXTERN double __kmpc_impl_get_wtick() {
- // Timer precision is 1ns
- return ((double)1E-9);
-}
-
-EXTERN double __kmpc_impl_get_wtime() {
- unsigned long long nsecs;
- asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
- return (double)nsecs * __kmpc_impl_get_wtick();
-}
-
-EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
- unsigned int Mask;
- asm volatile("activemask.b32 %0;" : "=r"(Mask));
- return Mask;
-}
-
-EXTERN void __kmpc_impl_syncthreads() {
- int barrier = 2;
- asm volatile("barrier.sync %0;"
- :
- : "r"(barrier)
- : "memory");
-}
-
-EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
- __nvvm_bar_warp_sync(Mask);
-}
-
-// NVPTX specific kernel initialization
-EXTERN void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
-}
-
-// Barrier until num_threads arrive.
-EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
- // The named barrier for active parallel threads of a team in an L1 parallel
- // region to synchronize with each other.
- int barrier = 1;
- asm volatile("barrier.sync %0, %1;"
- :
- : "r"(barrier), "r"(num_threads)
- : "memory");
-}
-
-EXTERN void __kmpc_impl_threadfence() { __nvvm_membar_gl(); }
-EXTERN void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); }
-EXTERN void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
-
-// Calls to the NVPTX layer (assuming 1D layout)
-EXTERN int __kmpc_get_hardware_thread_id_in_block() {
- return __nvvm_read_ptx_sreg_tid_x();
-}
-EXTERN int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); }
-EXTERN int __kmpc_get_hardware_num_blocks() {
- return __nvvm_read_ptx_sreg_nctaid_x();
-}
-EXTERN int __kmpc_get_hardware_num_threads_in_block() {
- return __nvvm_read_ptx_sreg_ntid_x();
-}
-EXTERN unsigned __kmpc_get_warp_size() { return WARPSIZE; }
-EXTERN unsigned GetWarpId() {
- return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE;
-}
-EXTERN unsigned GetLaneId() {
- return __kmpc_get_hardware_thread_id_in_block() & (WARPSIZE - 1);
-}
-
-// Atomics
-uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
- return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
-}
-uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
- return __nvvm_atom_inc_gen_ui(Address, Val);
-}
-
-uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
- return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
-}
-
-uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
- uint32_t R;
- __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
- return R;
-}
-
-uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) {
- (void)__atomic_compare_exchange(Address, &Compare, &Val, false,
- __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
- return Compare;
-}
-
-unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
- unsigned long long Val) {
- unsigned long long R;
- __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
- return R;
-}
-
-unsigned long long __kmpc_atomic_add(unsigned long long *Address,
- unsigned long long Val) {
- return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
-}
-
-#define __OMP_SPIN 1000
-#define UNSET 0u
-#define SET 1u
-
-EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock) {
- __kmpc_impl_unset_lock(lock);
-}
-
-EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
- __kmpc_impl_unset_lock(lock);
-}
-
-EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) {
- // TODO: not sure spinning is a good idea here..
- while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
- int32_t start = __nvvm_read_ptx_sreg_clock();
- int32_t now;
- for (;;) {
- now = __nvvm_read_ptx_sreg_clock();
- int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
- if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) {
- break;
- }
- }
- } // wait for 0 to be the read value
-}
-
-EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock) {
- (void)__kmpc_atomic_exchange(lock, UNSET);
-}
-
-EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) {
- return __kmpc_atomic_add(lock, 0u);
-}
-
-extern "C" {
-void *malloc(size_t);
-void free(void *);
-int32_t vprintf(const char *, void *);
-}
-
-EXTERN void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
-EXTERN void __kmpc_impl_free(void *x) { free(x); }
-
-EXTERN int32_t __llvm_omp_vprintf(const char *Format, void *Arguments,
- uint32_t) {
- return vprintf(Format, Arguments);
-}
-
-#pragma omp end declare target
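
The CAS/exchange helpers above are what the device-side OpenMP lock API ultimately spins on. The following is a minimal, hypothetical usage sketch, assuming an offload-capable toolchain and device and assuming omp_set_lock maps onto the __kmpc_impl_set_lock spin shown above; it is not a test from the removed suite.

```
#include <cstdio>
#include <omp.h>

int main() {
  int counter = 0;
#pragma omp target map(tofrom : counter)
  {
    omp_lock_t lock; // shared by the threads of the parallel region below
    omp_init_lock(&lock);
#pragma omp parallel
    {
      omp_set_lock(&lock);   // assumed to spin on the CAS loop shown above
      ++counter;             // serialized update
      omp_unset_lock(&lock); // assumed to map onto the atomic exchange to UNSET
    }
    omp_destroy_lock(&lock);
  }
  std::printf("counter = %d\n", counter);
  return 0;
}
```
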
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
deleted file mode 100644
index ab471e3116327..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ /dev/null
@@ -1,89 +0,0 @@
-//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Definitions of target specific functions
-//
-//===----------------------------------------------------------------------===//
-#ifndef _TARGET_IMPL_H_
-#define _TARGET_IMPL_H_
-
-#include "nvptx_interface.h"
-
-#include <stddef.h>
-#include <stdint.h>
-
-// subset of inttypes.h
-#define PRId64 "ld"
-#define PRIu64 "lu"
-
-typedef uint32_t __kmpc_impl_lanemask_t;
-
-#define INLINE inline __attribute__((always_inline))
-#define NOINLINE __attribute__((noinline))
-#define ALIGN(N) __attribute__((aligned(N)))
-#define PLUGIN_ACCESSIBLE /* no annotation needed for cuda plugin */
-
-#include "llvm/Frontend/OpenMP/OMPGridValues.h"
-
-INLINE constexpr const llvm::omp::GV &getGridValue() {
- return llvm::omp::NVPTXGridValues;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Kernel options
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-// The following def must match the absolute limit hardwired in the host RTL
-// max number of threads per team
-enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };
-enum { WARPSIZE = getGridValue().GV_Warp_Size };
-
-// Maximum number of omp state objects per SM allocated statically in global
-// memory.
-#if __CUDA_ARCH__ >= 600
-#define OMP_STATE_COUNT 32
-#else
-#define OMP_STATE_COUNT 16
-#endif
-
-#if !defined(MAX_SM)
-#if __CUDA_ARCH__ >= 900
-#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option
-#elif __CUDA_ARCH__ >= 800
-// GA100 design has a maximum of 128 SMs but A100 product only has 108 SMs
-// GA102 design has a maximum of 84 SMs
-#define MAX_SM 108
-#elif __CUDA_ARCH__ >= 700
-#define MAX_SM 84
-#elif __CUDA_ARCH__ >= 600
-#define MAX_SM 56
-#else
-#define MAX_SM 16
-#endif
-#endif
-
-#define OMP_ACTIVE_PARALLEL_LEVEL 128
-
-// Data sharing related quantities, need to match what is used in the compiler.
-enum DATA_SHARING_SIZES {
- // The size reserved for data in a shared memory slot.
- DS_Slot_Size = getGridValue().GV_Slot_Size,
- // The slot size that should be reserved for a working warp.
- DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(),
- // The maximum number of warps in use
- DS_Max_Warp_Number = getGridValue().maxWarpNumber(),
-};
-
-enum : __kmpc_impl_lanemask_t {
- __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
-};
-
-#define printf(...)
-
-#endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt
deleted file mode 100644
index df6f665329ead..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang")
- # Silently return, no need to annoy the user.
- return()
-endif()
-
-set(deps omptarget omp)
-if(LIBOMPTARGET_NVPTX_ENABLE_BCLIB)
- set(deps ${deps} omptarget-nvptx-bc)
-endif()
-
-# Run with only one thread to only launch one application to the GPU at a time.
-add_openmp_testsuite(check-libomptarget-nvptx
- "Running libomptarget-nvptx tests" ${CMAKE_CURRENT_BINARY_DIR}
- EXCLUDE_FROM_CHECK_ALL
- DEPENDS ${deps} ARGS -j1)
-
-set(LIBOMPTARGET_NVPTX_TEST_FLAGS "" CACHE STRING
- "Extra compiler flags to send to the test compiler.")
-set(LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS
- "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda" CACHE STRING
- "OpenMP compiler flags to use for testing libomptarget-nvptx.")
-
-# Configure the lit.site.cfg.in file
-set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget-nvptx configuration.\n# Do not edit!")
-configure_file(lit.site.cfg.in lit.site.cfg @ONLY)
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/api/get_max_threads.c b/openmp/libomptarget/deviceRTLs/nvptx/test/api/get_max_threads.c
deleted file mode 100644
index 60254bc7ed2e2..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/test/api/get_max_threads.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// RUN: %compile-run-and-check
-#include <omp.h>
-#include <stdio.h>
-
-int main(){
- int max_threads = -1;
- int num_threads = -1;
-
- #pragma omp target map(tofrom: max_threads)
- max_threads = omp_get_max_threads();
-
- #pragma omp target parallel map(tofrom: num_threads)
- {
- #pragma omp master
- num_threads = omp_get_num_threads();
- }
-
- // CHECK: Max Threads: 128, Num Threads: 128
- printf("Max Threads: %d, Num Threads: %d\n", max_threads, num_threads);
-
- return 0;
-}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/api/ignored.c b/openmp/libomptarget/deviceRTLs/nvptx/test/api/ignored.c
deleted file mode 100644
index 1fa9ae024f6f5..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/test/api/ignored.c
+++ /dev/null
@@ -1,38 +0,0 @@
-// RUN: %compile-run-and-check
-
-#include <omp.h>
-#include <stdio.h>
-
-const int MaxThreads = 1024;
-
-int main(int argc, char *argv[]) {
- int cancellation = -1, dynamic = -1, nested = -1, maxActiveLevels = -1;
-
- #pragma omp target map(cancellation, dynamic, nested, maxActiveLevels)
- {
- // libomptarget-nvptx doesn't support cancellation.
- cancellation = omp_get_cancellation();
-
- // No support for dynamic adjustment of the number of threads.
- omp_set_dynamic(1);
- dynamic = omp_get_dynamic();
-
- // libomptarget-nvptx doesn't support nested parallelism.
- omp_set_nested(1);
- nested = omp_get_nested();
-
- omp_set_max_active_levels(42);
- maxActiveLevels = omp_get_max_active_levels();
- }
-
- // CHECK: cancellation = 0
- printf("cancellation = %d\n", cancellation);
- // CHECK: dynamic = 0
- printf("dynamic = %d\n", dynamic);
- // CHECK: nested = 0
- printf("nested = %d\n", nested);
- // CHECK: maxActiveLevels = 1
- printf("maxActiveLevels = %d\n", maxActiveLevels);
-
- return 0;
-}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c b/openmp/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c
deleted file mode 100644
index efb418fef9a0b..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c
+++ /dev/null
@@ -1,53 +0,0 @@
-// RUN: %compile-run-and-check
-
-#include <omp.h>
-#include <stdio.h>
-
-int main(int argc, char *argv[]) {
- int MaxThreadsL1 = -1, MaxThreadsL2 = -1;
-
-#pragma omp declare reduction(unique:int \
- : omp_out = (omp_in == 1 ? omp_in : omp_out)) \
- initializer(omp_priv = -1)
-
- // Non-SPMD mode.
-#pragma omp target teams map(MaxThreadsL1, MaxThreadsL2) thread_limit(32) \
- num_teams(1)
- {
- MaxThreadsL1 = omp_get_max_threads();
-#pragma omp parallel reduction(unique : MaxThreadsL2)
- { MaxThreadsL2 = omp_get_max_threads(); }
- }
-
- //FIXME: This Non-SPMD kernel will have 32 active threads due to
- // thread_limit. However, Non-SPMD MaxThreadsL1 is the total number of
- // threads in block (64 in this case), which translates to worker
- // threads + WARP_SIZE for Non-SPMD kernels and worker threads for SPMD
- // kernels. According to the spec, omp_get_max_threads must return the
- // max active threads possible between the two kernel types.
-
- // CHECK: Non-SPMD MaxThreadsL1 = 64
- printf("Non-SPMD MaxThreadsL1 = %d\n", MaxThreadsL1);
- // CHECK: Non-SPMD MaxThreadsL2 = 1
- printf("Non-SPMD MaxThreadsL2 = %d\n", MaxThreadsL2);
-
- // SPMD mode with full runtime
- MaxThreadsL2 = -1;
-#pragma omp target parallel reduction(unique : MaxThreadsL2)
- { MaxThreadsL2 = omp_get_max_threads(); }
-
- // CHECK: SPMD with full runtime MaxThreadsL2 = 1
- printf("SPMD with full runtime MaxThreadsL2 = %d\n", MaxThreadsL2);
-
- // SPMD mode without runtime
- MaxThreadsL2 = -1;
-#pragma omp target parallel for reduction(unique : MaxThreadsL2)
- for (int I = 0; I < 2; ++I) {
- MaxThreadsL2 = omp_get_max_threads();
- }
-
- // CHECK: SPMD without runtime MaxThreadsL2 = 1
- printf("SPMD without runtime MaxThreadsL2 = %d\n", MaxThreadsL2);
-
- return 0;
-}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/api/thread_limit.c b/openmp/libomptarget/deviceRTLs/nvptx/test/api/thread_limit.c
deleted file mode 100644
index 626d620dc4f3a..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/test/api/thread_limit.c
+++ /dev/null
@@ -1,72 +0,0 @@
-// RUN: %compile-run-and-check
-
-#include <omp.h>
-#include <stdio.h>
-
-int main(int argc, char *argv[]) {
- int ThreadLimitL0 = -1, ThreadLimitL1 = -1, ThreadLimitL2 = -1;
-
-#pragma omp declare reduction(unique64:int \
- : omp_out = (omp_in == 64 ? omp_in : omp_out)) \
- initializer(omp_priv = -1)
-#pragma omp declare reduction(unique32:int \
- : omp_out = (omp_in == 32 ? omp_in : omp_out)) \
- initializer(omp_priv = -1)
-
- // Non-SPMD mode.
-#pragma omp target teams map(ThreadLimitL0, ThreadLimitL1, ThreadLimitL2) \
- thread_limit(64) num_teams(1)
- {
- ThreadLimitL0 = omp_get_thread_limit();
-#pragma omp parallel reduction(unique64 \
- : ThreadLimitL1, ThreadLimitL2) num_threads(32)
- {
- ThreadLimitL1 = omp_get_thread_limit();
-#pragma omp parallel reduction(unique64 : ThreadLimitL2)
- { ThreadLimitL2 = omp_get_thread_limit(); }
- }
- }
-
- // CHECK: Non-SPMD ThreadLimitL0 = 64
- printf("Non-SPMD ThreadLimitL0 = %d\n", ThreadLimitL0);
- // CHECK: Non-SPMD ThreadLimitL1 = 64
- printf("Non-SPMD ThreadLimitL1 = %d\n", ThreadLimitL1);
- // CHECK: Non-SPMD ThreadLimitL2 = 64
- printf("Non-SPMD ThreadLimitL2 = %d\n", ThreadLimitL2);
-
- // SPMD mode with full runtime
- ThreadLimitL1 = -1;
- ThreadLimitL2 = -1;
-#pragma omp target parallel reduction(unique32 \
- : ThreadLimitL1, ThreadLimitL2) \
- num_threads(32)
- {
- ThreadLimitL1 = omp_get_thread_limit();
-#pragma omp parallel reduction(unique32 : ThreadLimitL2)
- { ThreadLimitL2 = omp_get_thread_limit(); }
- }
-
- // CHECK: SPMD with full runtime ThreadLimitL1 = 32
- printf("SPMD with full runtime ThreadLimitL1 = %d\n", ThreadLimitL1);
- // CHECK: SPMD with full runtime ThreadLimitL2 = 32
- printf("SPMD with full runtime ThreadLimitL2 = %d\n", ThreadLimitL2);
-
- // SPMD mode without runtime
- ThreadLimitL1 = -1;
- ThreadLimitL2 = -1;
-#pragma omp target parallel for reduction(unique32 \
- : ThreadLimitL1, ThreadLimitL2) \
- num_threads(32)
- for (int I = 0; I < 2; ++I) {
- ThreadLimitL1 = omp_get_thread_limit();
-#pragma omp parallel reduction(unique32 : ThreadLimitL2)
- { ThreadLimitL2 = omp_get_thread_limit(); }
- }
-
- // CHECK: SPMD without runtime ThreadLimitL1 = 32
- printf("SPMD without runtime ThreadLimitL1 = %d\n", ThreadLimitL1);
- // CHECK: SPMD without runtime ThreadLimitL2 = 32
- printf("SPMD without runtime ThreadLimitL2 = %d\n", ThreadLimitL2);
-
- return 0;
-}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c b/openmp/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c
deleted file mode 100644
index dd17ae7c6a76c..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c
+++ /dev/null
@@ -1,55 +0,0 @@
-// RUN: %compile-run-and-check
-
-#include <omp.h>
-#include <stdio.h>
-
-#pragma omp declare target
-static void putValueInParallel(int *ptr, int value) {
- #pragma omp parallel
- {
- *ptr = value;
- }
-}
-
-static int getId() {
- int id;
- putValueInParallel(&id, omp_get_thread_num());
- return id;
-}
-#pragma omp end declare target
-
-const int MaxThreads = 1024;
-const int Threads = 64;
-
-int main(int argc, char *argv[]) {
- int master;
- int check[MaxThreads];
- for (int i = 0; i < MaxThreads; i++) {
- check[i] = 0;
- }
-
- #pragma omp target map(master, check[:])
- {
- master = getId();
-
- #pragma omp parallel num_threads(Threads)
- {
- check[omp_get_thread_num()] = getId();
- }
- }
-
- // CHECK: master = 0.
- printf("master = %d.\n", master);
- // CHECK-NOT: invalid
- for (int i = 0; i < MaxThreads; i++) {
- if (i < Threads) {
- if (check[i] != i) {
- printf("invalid: check[%d] should be %d, is %d\n", i, i, check[i]);
- }
- } else if (check[i] != 0) {
- printf("invalid: check[%d] should be 0, is %d\n", i, check[i]);
- }
- }
-
- return 0;
-}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/lit.cfg b/openmp/libomptarget/deviceRTLs/nvptx/test/lit.cfg
deleted file mode 100644
index 954059bd02c2e..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/test/lit.cfg
+++ /dev/null
@@ -1,76 +0,0 @@
-# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79:
-# Configuration file for the 'lit' test runner.
-
-import os
-import lit.formats
-
-# Tell pylint that we know config and lit_config exist somewhere.
-if 'PYLINT_IMPORT' in os.environ:
- config = object()
- lit_config = object()
-
-def prepend_library_path(name, value, sep):
- if name in config.environment:
- config.environment[name] = value + sep + config.environment[name]
- else:
- config.environment[name] = value
-
-# name: The name of this test suite.
-config.name = 'libomptarget-nvptx'
-
-# suffixes: A list of file extensions to treat as test files.
-config.suffixes = ['.c', '.cpp', '.cc']
-
-# test_source_root: The root path where tests are located.
-config.test_source_root = os.path.dirname(__file__)
-
-# test_exec_root: The root object directory where output is placed
-config.test_exec_root = config.binary_dir
-
-# test format
-config.test_format = lit.formats.ShTest()
-
-# compiler flags
-config.test_flags = " -I " + config.omp_header_directory + \
- " -L " + config.library_dir
-
-if config.omp_host_rtl_directory:
- config.test_flags = config.test_flags + \
- " -L " + config.omp_host_rtl_directory
-
-config.test_flags = config.test_flags + " " + config.test_extra_flags
-
-# Setup environment to find dynamic library at runtime.
-prepend_library_path('LIBRARY_PATH', config.library_dir, ":")
-prepend_library_path('LD_LIBRARY_PATH', config.library_dir, ":")
-prepend_library_path('LD_LIBRARY_PATH', config.omp_host_rtl_directory, ":")
-if config.cuda_libdir:
- prepend_library_path('LD_LIBRARY_PATH', config.cuda_libdir, ":")
-
-# Forbid fallback to host.
-config.environment["OMP_TARGET_OFFLOAD"] = "MANDATORY"
-
-# substitutions
-config.substitutions.append(("%compilexx-run-and-check",
- "%compilexx-and-run | " + config.libomptarget_filecheck + " %s"))
-config.substitutions.append(("%compile-run-and-check",
- "%compile-and-run | " + config.libomptarget_filecheck + " %s"))
-config.substitutions.append(("%compilexx-and-run", "%compilexx && %run"))
-config.substitutions.append(("%compile-and-run", "%compile && %run"))
-
-config.substitutions.append(("%compilexx",
- "%clangxx %openmp_flags %cuda_flags %flags %s -o %t"))
-config.substitutions.append(("%compile",
- "%clang %openmp_flags %cuda_flags %flags %s -o %t"))
-
-config.substitutions.append(("%clangxx", config.test_cxx_compiler))
-config.substitutions.append(("%clang", config.test_c_compiler))
-config.substitutions.append(("%openmp_flags", config.test_openmp_flags))
-if config.cuda_path:
- config.substitutions.append(("%cuda_flags", "--cuda-path=" + config.cuda_path))
-else:
- config.substitutions.append(("%cuda_flags", ""))
-config.substitutions.append(("%flags", config.test_flags))
-
-config.substitutions.append(("%run", "%t"))
-config.substitutions.append(("%not", config.libomptarget_not))
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in b/openmp/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in
deleted file mode 100644
index 22374d5e3566c..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in
+++ /dev/null
@@ -1,17 +0,0 @@
-@AUTO_GEN_COMMENT@
-
-config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@"
-config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@"
-config.test_openmp_flags = "@LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS@"
-config.test_extra_flags = "@LIBOMPTARGET_NVPTX_TEST_FLAGS@"
-config.cuda_path = "@CUDA_TOOLKIT_ROOT_DIR@"
-config.cuda_libdir = "@CUDA_LIBDIR@"
-config.binary_dir = "@CMAKE_CURRENT_BINARY_DIR@"
-config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@"
-config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@"
-config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@"
-config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@"
-config.libomptarget_not = "@OPENMP_NOT_EXECUTABLE@"
-
-# Let the main config do the real work.
-lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg")
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/barrier.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/barrier.c
deleted file mode 100644
index 7c707718e13bd..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/barrier.c
+++ /dev/null
@@ -1,37 +0,0 @@
-// RUN: %compile-run-and-check
-
-#include <omp.h>
-#include <stdio.h>
-
-int main(int argc, char *argv[]) {
- int data, out, flag = 0;
-#pragma omp target teams num_teams(2) map(tofrom \
- : out) map(to \
- : data, flag) \
- thread_limit(1)
-#pragma omp parallel num_threads(1)
- {
- if (omp_get_team_num() == 0) {
- /* Write to the data buffer that will be read by thread in team 1 */
- data = 42;
-/* Flush data to thread in team 1 */
-#pragma omp barrier
- /* Set flag to release thread in team 1 */
-#pragma omp atomic write
- flag = 1;
- } else if (omp_get_team_num() == 1) {
- /* Loop until we see the update to the flag */
- int val;
- do {
-#pragma omp atomic read
- val = flag;
- } while (val < 1);
- out = data;
-#pragma omp barrier
- }
- }
- // CHECK: out=42.
- /* Value of out will be 42 */
- printf("out=%d.\n", out);
- return !(out == 42);
-}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/flush.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/flush.c
deleted file mode 100644
index 412538b6dd156..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/flush.c
+++ /dev/null
@@ -1,35 +0,0 @@
-// RUN: %compile-run-and-check
-
-#include <omp.h>
-#include <stdio.h>
-
-int main(int argc, char *argv[]) {
- int data, out, flag = 0;
-#pragma omp target parallel num_threads(64) map(tofrom \
- : out, flag) map(to \
- : data)
- {
- if (omp_get_thread_num() == 0) {
- /* Write to the data buffer that will be read by thread */
- data = 42;
-/* Flush data to thread 32 */
-#pragma omp flush(data)
- /* Set flag to release thread 32 */
-#pragma omp atomic write
- flag = 1;
- } else if (omp_get_thread_num() == 32) {
- /* Loop until we see the update to the flag */
- int val;
- do {
-#pragma omp atomic read
- val = flag;
- } while (val < 1);
- out = data;
-#pragma omp flush(out)
- }
- }
- // CHECK: out=42.
- /* Value of out will be 42 */
- printf("out=%d.\n", out);
- return !(out == 42);
-}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c
deleted file mode 100644
index 0a137530cef74..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c
+++ /dev/null
@@ -1,151 +0,0 @@
-// RUN: %compile-run-and-check
-
-#include <omp.h>
-#include <stdio.h>
-
-const int MaxThreads = 1024;
-const int NumThreads = 64;
-
-int main(int argc, char *argv[]) {
- int level = -1, activeLevel = -1;
- // The expected value is -1, initialize to different value.
- int ancestorTNumNeg = 1, teamSizeNeg = 1;
- int ancestorTNum0 = -1, teamSize0 = -1;
- // The expected value is -1, initialize to different value.
- int ancestorTNum1 = 1, teamSize1 = 1;
- int check1[MaxThreads];
- int check2[MaxThreads];
- int check3[MaxThreads];
- int check4[MaxThreads];
- for (int i = 0; i < MaxThreads; i++) {
- check1[i] = check2[i] = check3[i] = check4[i] = 0;
- }
-
- #pragma omp target map(level, activeLevel, ancestorTNumNeg, teamSizeNeg) \
- map(ancestorTNum0, teamSize0, ancestorTNum1, teamSize1) \
- map(check1[:], check2[:], check3[:], check4[:])
- {
- level = omp_get_level();
- activeLevel = omp_get_active_level();
-
- // Expected to return -1.
- ancestorTNumNeg = omp_get_ancestor_thread_num(-1);
- teamSizeNeg = omp_get_team_size(-1);
-
- // Expected to return 0 and 1.
- ancestorTNum0 = omp_get_ancestor_thread_num(0);
- teamSize0 = omp_get_team_size(0);
-
- // Expected to return -1 because the requested level is larger than
- // the nest level.
- ancestorTNum1 = omp_get_ancestor_thread_num(1);
- teamSize1 = omp_get_team_size(1);
-
- // Expecting active parallel region.
- #pragma omp parallel num_threads(NumThreads)
- {
- int id = omp_get_thread_num();
- // Multiply return value of omp_get_level by 5 to avoid that this test
- // passes if both API calls return wrong values.
- check1[id] += omp_get_level() * 5 + omp_get_active_level();
-
- // Expected to return 0 and 1.
- check2[id] += omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0);
- // Expected to return the current thread num.
- check2[id] += (omp_get_ancestor_thread_num(1) - id);
- // Expected to return the current number of threads.
- check2[id] += 3 * omp_get_team_size(1);
- // Expected to return -1, see above.
- check2[id] += omp_get_ancestor_thread_num(2) + omp_get_team_size(2);
-
- // Expecting serialized parallel region.
- #pragma omp parallel
- {
- #pragma omp atomic
- check3[id] += omp_get_level() * 5 + omp_get_active_level();
-
- // Expected to return 0 and 1.
- int check4Inc = omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0);
- // Expected to return the parent thread num.
- check4Inc += (omp_get_ancestor_thread_num(1) - id);
- // Expected to return the number of threads in the active parallel region.
- check4Inc += 3 * omp_get_team_size(1);
- // Expected to return 0 and 1.
- check4Inc += omp_get_ancestor_thread_num(2) + 3 * omp_get_team_size(2);
- // Expected to return -1, see above.
- check4Inc += omp_get_ancestor_thread_num(3) + omp_get_team_size(3);
-
- #pragma omp atomic
- check4[id] += check4Inc;
- }
- }
- }
-
- // CHECK: target: level = 0, activeLevel = 0
- printf("target: level = %d, activeLevel = %d\n", level, activeLevel);
- // CHECK: level = -1: ancestorTNum = -1, teamSize = -1
- printf("level = -1: ancestorTNum = %d, teamSize = %d\n", ancestorTNumNeg, teamSizeNeg);
- // CHECK: level = 0: ancestorTNum = 0, teamSize = 1
- printf("level = 0: ancestorTNum = %d, teamSize = %d\n", ancestorTNum0, teamSize0);
- // CHECK: level = 1: ancestorTNum = -1, teamSize = -1
- printf("level = 1: ancestorTNum = %d, teamSize = %d\n", ancestorTNum1, teamSize1);
-
- // CHECK-NOT: invalid
- for (int i = 0; i < MaxThreads; i++) {
- // Check active parallel region:
- // omp_get_level() = 1, omp_get_active_level() = 1
- const int Expected1 = 6;
- if (i < NumThreads) {
- if (check1[i] != Expected1) {
- printf("invalid: check1[%d] should be %d, is %d\n", i, Expected1, check1[i]);
- }
- } else if (check1[i] != 0) {
- printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
- }
-
- // 5 * 1 + 3 * 64 - 1 - 1 (see above)
- const int Expected2 = 195;
- if (i < NumThreads) {
- if (check2[i] != Expected2) {
- printf("invalid: check2[%d] should be %d, is %d\n", i, Expected2, check2[i]);
- }
- } else if (check2[i] != 0) {
- printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
- }
-
- // Check serialized parallel region:
- // omp_get_level() = 2, omp_get_active_level() = 1
- const int Expected3 = 11;
- if (i < NumThreads) {
- if (check3[i] != Expected3) {
- printf("invalid: check3[%d] should be %d, is %d\n", i, Expected3, check3[i]);
- }
- } else if (check3[i] != 0) {
- printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]);
- }
-
- // 5 * 1 + 3 * 64 + 3 * 1 - 1 - 1 (see above)
- const int Expected4 = 198;
- if (i < NumThreads) {
- if (check4[i] != Expected4) {
- printf("invalid: check4[%d] should be %d, is %d\n", i, Expected4, check4[i]);
- }
- } else if (check4[i] != 0) {
- printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]);
- }
- }
-
- // Check for parallel level in non-SPMD kernels.
- level = 0;
- #pragma omp target teams distribute num_teams(1) thread_limit(32) reduction(+:level)
- for (int i=0; i<5032; i+=32) {
- int ub = (i+32 > 5032) ? 5032 : i+32;
- #pragma omp parallel for schedule(dynamic)
- for (int j=i ; j < ub; j++) ;
- level += omp_get_level();
- }
- // CHECK: Integral level = 0.
- printf("Integral level = %d.\n", level);
-
- return 0;
-}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c
deleted file mode 100644
index 70ebb1da9592e..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c
+++ /dev/null
@@ -1,136 +0,0 @@
-// RUN: %compile-run-and-check
-
-#include <omp.h>
-#include <stdio.h>
-
-const int MaxThreads = 1024;
-const int NumThreads = 64;
-const int NumThreads1 = 1;
-
-int main(int argc, char *argv[]) {
- int inParallel = -1, numThreads = -1, threadNum = -1;
- int check1[MaxThreads];
- int check2[MaxThreads];
- for (int i = 0; i < MaxThreads; i++) {
- check1[i] = check2[i] = 0;
- }
-
-#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:])
- {
- inParallel = omp_in_parallel();
- numThreads = omp_get_num_threads();
- threadNum = omp_get_thread_num();
-
-// Expecting active parallel region.
-#pragma omp parallel num_threads(NumThreads)
- {
- int id = omp_get_thread_num();
- check1[id] += omp_get_num_threads() + omp_in_parallel();
-
-// Expecting serialized parallel region.
-#pragma omp parallel
- {
- // Expected to be 1.
- int nestedInParallel = omp_in_parallel();
- // Expected to be 1.
- int nestedNumThreads = omp_get_num_threads();
- // Expected to be 0.
- int nestedThreadNum = omp_get_thread_num();
-#pragma omp atomic
- check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum;
- }
- }
- }
-
- // CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0
- printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n",
- inParallel, numThreads, threadNum);
-
- // CHECK-NOT: invalid
- for (int i = 0; i < MaxThreads; i++) {
- // Check that all threads reported
- // omp_get_num_threads() = 64, omp_in_parallel() = 1.
- int Expected = NumThreads + 1;
- if (i < NumThreads) {
- if (check1[i] != Expected) {
- printf("invalid: check1[%d] should be %d, is %d\n", i, Expected,
- check1[i]);
- }
- } else if (check1[i] != 0) {
- printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
- }
-
- // Check serialized parallel region.
- if (i < NumThreads) {
- if (check2[i] != 2) {
- printf("invalid: check2[%d] should be 2, is %d\n", i, check2[i]);
- }
- } else if (check2[i] != 0) {
- printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
- }
- }
-
- inParallel = -1;
- numThreads = -1;
- threadNum = -1;
- for (int i = 0; i < MaxThreads; i++) {
- check1[i] = check2[i] = 0;
- }
-
-#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:])
- {
- inParallel = omp_in_parallel();
- numThreads = omp_get_num_threads();
- threadNum = omp_get_thread_num();
-
-// Expecting active parallel region.
-#pragma omp parallel num_threads(NumThreads1)
- {
- int id = omp_get_thread_num();
- check1[id] += omp_get_num_threads() + omp_in_parallel();
-
-// Expecting serialized parallel region.
-#pragma omp parallel
- {
- // Expected to be 0.
- int nestedInParallel = omp_in_parallel();
- // Expected to be 1.
- int nestedNumThreads = omp_get_num_threads();
- // Expected to be 0.
- int nestedThreadNum = omp_get_thread_num();
-#pragma omp atomic
- check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum;
- }
- }
- }
-
- // CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0
- printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n",
- inParallel, numThreads, threadNum);
-
- // CHECK-NOT: invalid
- for (int i = 0; i < MaxThreads; i++) {
- // Check that all threads reported
- // omp_get_num_threads() = 1, omp_in_parallel() = 0.
- int Expected = 1;
- if (i < NumThreads1) {
- if (check1[i] != Expected) {
- printf("invalid: check1[%d] should be %d, is %d\n", i, Expected,
- check1[i]);
- }
- } else if (check1[i] != 0) {
- printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
- }
-
- // Check serialized parallel region.
- if (i < NumThreads1) {
- if (check2[i] != 1) {
- printf("invalid: check2[%d] should be 1, is %d\n", i, check2[i]);
- }
- } else if (check2[i] != 0) {
- printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
- }
- }
-
- return 0;
-}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c
deleted file mode 100644
index 4a2f73fee827a..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c
+++ /dev/null
@@ -1,102 +0,0 @@
-// RUN: %compile-run-and-check
-
-#include <stdio.h>
-#include <omp.h>
-
-const int WarpSize = 32;
-const int NumThreads1 = 1 * WarpSize;
-const int NumThreads2 = 2 * WarpSize;
-const int NumThreads3 = 3 * WarpSize;
-const int MaxThreads = 1024;
-
-int main(int argc, char *argv[]) {
- int check1[MaxThreads];
- int check2[MaxThreads];
- int check3[MaxThreads];
- int check4[MaxThreads];
- for (int i = 0; i < MaxThreads; i++) {
- check1[i] = check2[i] = check3[i] = check4[i] = 0;
- }
-
- int maxThreads1 = -1;
- int maxThreads2 = -1;
- int maxThreads3 = -1;
-
- #pragma omp target map(check1[:], check2[:], check3[:], check4[:]) \
- map(maxThreads1, maxThreads2, maxThreads3)
- {
- #pragma omp parallel num_threads(NumThreads1)
- {
- check1[omp_get_thread_num()] += omp_get_num_threads();
- }
-
- // API method to set number of threads in parallel regions without
- // num_threads() clause.
- omp_set_num_threads(NumThreads2);
- maxThreads1 = omp_get_max_threads();
- #pragma omp parallel
- {
- check2[omp_get_thread_num()] += omp_get_num_threads();
- }
-
- maxThreads2 = omp_get_max_threads();
-
- // num_threads() clause should override nthreads-var ICV.
- #pragma omp parallel num_threads(NumThreads3)
- {
- check3[omp_get_thread_num()] += omp_get_num_threads();
- }
-
- maxThreads3 = omp_get_max_threads();
-
- // Effect from omp_set_num_threads() should still be visible.
- #pragma omp parallel
- {
- check4[omp_get_thread_num()] += omp_get_num_threads();
- }
- }
-
- // CHECK: maxThreads1 = 64
- printf("maxThreads1 = %d\n", maxThreads1);
- // CHECK: maxThreads2 = 64
- printf("maxThreads2 = %d\n", maxThreads2);
- // CHECK: maxThreads3 = 64
- printf("maxThreads3 = %d\n", maxThreads3);
-
- // CHECK-NOT: invalid
- for (int i = 0; i < MaxThreads; i++) {
- if (i < NumThreads1) {
- if (check1[i] != NumThreads1) {
- printf("invalid: check1[%d] should be %d, is %d\n", i, NumThreads1, check1[i]);
- }
- } else if (check1[i] != 0) {
- printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
- }
-
- if (i < NumThreads2) {
- if (check2[i] != NumThreads2) {
- printf("invalid: check2[%d] should be %d, is %d\n", i, NumThreads2, check2[i]);
- }
- } else if (check2[i] != 0) {
- printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
- }
-
- if (i < NumThreads3) {
- if (check3[i] != NumThreads3) {
- printf("invalid: check3[%d] should be %d, is %d\n", i, NumThreads3, check3[i]);
- }
- } else if (check3[i] != 0) {
- printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]);
- }
-
- if (i < NumThreads2) {
- if (check4[i] != NumThreads2) {
- printf("invalid: check4[%d] should be %d, is %d\n", i, NumThreads2, check4[i]);
- }
- } else if (check4[i] != 0) {
- printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]);
- }
- }
-
- return 0;
-}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp
deleted file mode 100644
index 517db59f64ae3..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-// RUN: %compilexx-run-and-check
-
-#include <stdio.h>
-#include <omp.h>
-
-int main(void) {
- int isHost = -1;
- int ParallelLevel1 = -1, ParallelLevel2 = -1;
- int Count = 0;
-
-#pragma omp target parallel for map(tofrom \
- : isHost, ParallelLevel1, ParallelLevel2), reduction(+: Count) schedule(static, 1)
- for (int J = 0; J < 10; ++J) {
-#pragma omp critical
- {
- isHost = (isHost < 0 || isHost == 0) ? omp_is_initial_device() : isHost;
- ParallelLevel1 = (ParallelLevel1 < 0 || ParallelLevel1 == 1)
- ? omp_get_level()
- : ParallelLevel1;
- }
- if (omp_get_thread_num() > 5) {
- int L2;
-#pragma omp parallel for schedule(dynamic) lastprivate(L2) reduction(+: Count)
- for (int I = 0; I < 10; ++I) {
- L2 = omp_get_level();
- Count += omp_get_level(); // (10-6)*10*2 = 80
- }
-#pragma omp critical
- ParallelLevel2 =
- (ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : ParallelLevel2;
- } else {
- Count += omp_get_level(); // 6 * 1 = 6
- }
- }
-
- if (isHost < 0) {
- printf("Runtime error, isHost=%d\n", isHost);
- }
-
- // CHECK: Target region executed on the device
- printf("Target region executed on the %s\n", isHost ? "host" : "device");
- // CHECK: Parallel level in SPMD mode: L1 is 1, L2 is 2
- printf("Parallel level in SPMD mode: L1 is %d, L2 is %d\n", ParallelLevel1,
- ParallelLevel2);
- // Final result of Count is (10-6)(num of loops)*10(num of iterations)*2(par
- // level) + 6(num of iterations) * 1(par level)
- // CHECK: Expected count = 86
- printf("Expected count = %d\n", Count);
-
- return isHost;
-}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c
deleted file mode 100644
index 5e40bb564aa0f..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c
+++ /dev/null
@@ -1,77 +0,0 @@
-// RUN: %compile-run-and-check
-
-#include <stdio.h>
-#include <omp.h>
-
-const int WarpSize = 32;
-const int ThreadLimit = 1 * WarpSize;
-const int NumThreads2 = 2 * WarpSize;
-const int NumThreads3 = 3 * WarpSize;
-const int MaxThreads = 1024;
-
-int main(int argc, char *argv[]) {
- int check1[MaxThreads];
- int check2[MaxThreads];
- int check3[MaxThreads];
- for (int i = 0; i < MaxThreads; i++) {
- check1[i] = check2[i] = check3[i] = 0;
- }
-
- int threadLimit = -1;
-
- #pragma omp target teams num_teams(1) thread_limit(ThreadLimit) \
- map(check1[:], check2[:], check3[:], threadLimit)
- {
- threadLimit = omp_get_thread_limit();
-
- // All parallel regions should get as many threads as specified by the
- // thread_limit() clause.
- #pragma omp parallel
- {
- check1[omp_get_thread_num()] += omp_get_num_threads();
- }
-
- omp_set_num_threads(NumThreads2);
- #pragma omp parallel
- {
- check2[omp_get_thread_num()] += omp_get_num_threads();
- }
-
- #pragma omp parallel num_threads(NumThreads3)
- {
- check3[omp_get_thread_num()] += omp_get_num_threads();
- }
- }
-
- // CHECK: threadLimit = 32
- printf("threadLimit = %d\n", threadLimit);
-
- // CHECK-NOT: invalid
- for (int i = 0; i < MaxThreads; i++) {
- if (i < ThreadLimit) {
- if (check1[i] != ThreadLimit) {
- printf("invalid: check1[%d] should be %d, is %d\n", i, ThreadLimit, check1[i]);
- }
- } else if (check1[i] != 0) {
- printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
- }
-
- if (i < ThreadLimit) {
- if (check2[i] != ThreadLimit) {
- printf("invalid: check2[%d] should be %d, is %d\n", i, ThreadLimit, check2[i]);
- }
- } else if (check2[i] != 0) {
- printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
- }
-
- if (i < ThreadLimit) {
- if (check3[i] != ThreadLimit) {
- printf("invalid: check3[%d] should be %d, is %d\n", i, ThreadLimit, check3[i]);
- }
- } else if (check3[i] != 0) {
- printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]);
- }
- }
-
- return 0;
-}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c
deleted file mode 100644
index b3f8768564080..0000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// RUN: %compile-run-and-check
-
-#include <omp.h>
-#include <stdio.h>
-
-int main() {
- int res = 0;
-
-#pragma omp parallel num_threads(2) reduction(+:res)
- {
- int tid = omp_get_thread_num();
-#pragma omp target teams distribute reduction(+:res)
- for (int i = tid; i < 2; i++)
- ++res;
- }
- // The first thread makes 2 iterations, the second - 1. Expected result of the
- // reduction res is 3.
-
- // CHECK: res = 3.
- printf("res = %d.\n", res);
- return 0;
-}
diff --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h
deleted file mode 100644
index cf651974b71fa..0000000000000
--- a/openmp/libomptarget/deviceRTLs/target_interface.h
+++ /dev/null
@@ -1,78 +0,0 @@
-//===------------- target_interface.h - Target interfaces --------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains interfaces that must be implemented by each target.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _OMPTARGET_TARGET_INTERFACE_H_
-#define _OMPTARGET_TARGET_INTERFACE_H_
-
-#include <stdint.h>
-
-#include "DeviceEnvironment.h"
-#include "target_impl.h"
-
-// Calls to the NVPTX layer (assuming 1D layout)
-EXTERN int __kmpc_get_hardware_thread_id_in_block();
-EXTERN int GetBlockIdInKernel();
-EXTERN NOINLINE int __kmpc_get_hardware_num_blocks();
-EXTERN NOINLINE int __kmpc_get_hardware_num_threads_in_block();
-EXTERN unsigned __kmpc_get_warp_size();
-EXTERN unsigned GetWarpId();
-EXTERN unsigned GetLaneId();
-
-// Atomics
-uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
-uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
-uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
-uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
-uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
-static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
-unsigned long long __kmpc_atomic_exchange(unsigned long long *,
- unsigned long long);
-unsigned long long __kmpc_atomic_add(unsigned long long *, unsigned long long);
-
-// Locks
-EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);
-EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock);
-EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock);
-EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock);
-EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock);
-
-EXTERN void __kmpc_impl_threadfence();
-EXTERN void __kmpc_impl_threadfence_block();
-EXTERN void __kmpc_impl_threadfence_system();
-
-EXTERN double __kmpc_impl_get_wtick();
-EXTERN double __kmpc_impl_get_wtime();
-
-EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
-EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
-EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
-EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
-EXTERN uint32_t __kmpc_impl_smid();
-
-EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask();
-
-EXTERN void __kmpc_impl_syncthreads();
-EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);
-
-// Kernel initialization
-EXTERN void __kmpc_impl_target_init();
-
-// Memory
-EXTERN void *__kmpc_impl_malloc(size_t);
-EXTERN void __kmpc_impl_free(void *);
-
-// Barrier until num_threads arrive.
-EXTERN void __kmpc_impl_named_sync(uint32_t num_threads);
-
-extern DeviceEnvironmentTy omptarget_device_environment;
-
-#endif // _OMPTARGET_TARGET_INTERFACE_H_
diff --git a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
index 92523c23f68b1..7d56763c81ee0 100644
--- a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
@@ -118,6 +118,6 @@ if (${amdgpu_arch_result})
libomptarget_say("Not generating amdgcn test targets as amdgpu-arch exited with ${amdgpu_arch_result}")
else()
# Report to the parent scope that we are building a plugin for amdgpu
- set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa-newRTL " PARENT_SCOPE)
+ set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa " PARENT_SCOPE)
endif()
diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt
index 4fa8b995bc219..4329386fda6f4 100644
--- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/cuda/CMakeLists.txt
@@ -72,7 +72,7 @@ target_link_libraries(omptarget.rtl.cuda
# Otherwise this plugin is being built speculatively and there may be no cuda available
if (LIBOMPTARGET_CAN_LINK_LIBCUDA OR LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA)
libomptarget_say("Enable tests using CUDA plugin")
- set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda-newRTL nvptx64-nvidia-cuda-newDriver" PARENT_SCOPE)
+ set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda nvptx64-nvidia-cuda-newDriver" PARENT_SCOPE)
else()
libomptarget_say("Disabling tests using CUDA plugin as cuda may not be available")
endif()
diff --git a/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c b/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c
index 98bf4eb5da39f..9189e51f12f8c 100644
--- a/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c
+++ b/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c
@@ -1,4 +1,4 @@
-// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -fopenmp-target-new-runtime
+// RUN: %libomptarget-compile-nvptx64-nvidia-cuda
// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
// RUN: %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda
// REQUIRES: nvptx64-nvidia-cuda
diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg
index 63d327809f9e3..c262c05532249 100644
--- a/openmp/libomptarget/test/lit.cfg
+++ b/openmp/libomptarget/test/lit.cfg
@@ -104,17 +104,11 @@ else: # Unices
config.test_flags += " --libomptarget-amdgcn-bc-path=" + config.library_dir
if config.libomptarget_current_target.startswith('nvptx'):
config.test_flags += " --libomptarget-nvptx-bc-path=" + config.library_dir
- if config.libomptarget_current_target.endswith('-newRTL'):
- config.test_flags += " -fopenmp-target-new-runtime"
- elif not config.libomptarget_current_target.endswith('-newDriver'):
- config.test_flags += " -fno-openmp-target-new-runtime"
if config.libomptarget_current_target.endswith('-newDriver'):
config.test_flags += " -fopenmp-new-driver"
-def remove_newRTL_suffix_if_present(name):
- if name.endswith('-newRTL'):
- return name[:-7]
- elif name.endswith('-newDriver'):
+def remove_suffix_if_present(name):
+ if name.endswith('-newDriver'):
return name[:-10]
else:
return name
@@ -183,10 +177,10 @@ for libomptarget_target in config.libomptarget_all_targets:
"%not --crash %t"))
config.substitutions.append(("%clangxx-" + libomptarget_target, \
"%clangxx %openmp_flags %cuda_flags %flags -fopenmp-targets=" +\
- remove_newRTL_suffix_if_present(libomptarget_target)))
+ remove_suffix_if_present(libomptarget_target)))
config.substitutions.append(("%clang-" + libomptarget_target, \
"%clang %openmp_flags %cuda_flags %flags -fopenmp-targets=" +\
- remove_newRTL_suffix_if_present(libomptarget_target)))
+ remove_suffix_if_present(libomptarget_target)))
config.substitutions.append(("%fcheck-" + libomptarget_target, \
config.libomptarget_filecheck + " %s"))
else:
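For reference, after this patch the suffix handling in openmp/libomptarget/test/lit.cfg only needs to strip the -newDriver marker from pseudo-targets; the -newRTL suffix disappears because the formerly new runtime is now the only runtime. A minimal standalone sketch of that behavior (the assert lines are illustrative usage, not taken from the file):

    # Sketch of the post-patch suffix handling in lit.cfg.
    def remove_suffix_if_present(name):
        # Only -newDriver remains a pseudo-target suffix to strip.
        if name.endswith('-newDriver'):
            return name[:-len('-newDriver')]
        return name

    # Both spellings resolve to the same offload triple.
    assert remove_suffix_if_present('nvptx64-nvidia-cuda-newDriver') == 'nvptx64-nvidia-cuda'
    assert remove_suffix_if_present('amdgcn-amd-amdhsa') == 'amdgcn-amd-amdhsa'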
diff --git a/openmp/libomptarget/test/mapping/data_member_ref.cpp b/openmp/libomptarget/test/mapping/data_member_ref.cpp
index 5ac1a0b973630..6b52a04e34f1d 100644
--- a/openmp/libomptarget/test/mapping/data_member_ref.cpp
+++ b/openmp/libomptarget/test/mapping/data_member_ref.cpp
@@ -2,7 +2,6 @@
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
-// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <stdio.h>
diff --git a/openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp b/openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp
index 00d85723dffed..2520ab4bf15d3 100644
--- a/openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp
+++ b/openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp
@@ -2,7 +2,6 @@
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
-// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <cstdio>
#include <cstdlib>
diff --git a/openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp b/openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp
index eadf4609bb4f7..c95c4962d180b 100644
--- a/openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp
+++ b/openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp
@@ -2,7 +2,6 @@
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
-// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <cstdio>
#include <cstdlib>
diff --git a/openmp/libomptarget/test/mapping/lambda_by_value.cpp b/openmp/libomptarget/test/mapping/lambda_by_value.cpp
index 711decb26b603..69a0ec705d1d7 100644
--- a/openmp/libomptarget/test/mapping/lambda_by_value.cpp
+++ b/openmp/libomptarget/test/mapping/lambda_by_value.cpp
@@ -2,7 +2,6 @@
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
-// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <stdio.h>
#include <stdint.h>
diff --git a/openmp/libomptarget/test/mapping/lambda_mapping.cpp b/openmp/libomptarget/test/mapping/lambda_mapping.cpp
index d421dcec1f317..79ddf33b1ffd0 100644
--- a/openmp/libomptarget/test/mapping/lambda_mapping.cpp
+++ b/openmp/libomptarget/test/mapping/lambda_mapping.cpp
@@ -1,7 +1,7 @@
// RUN: %libomptarget-compilexx-run-and-check-generic
// Error on the gpu that crashes the host
-// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
+// UNSUPPORTED: amdgcn-amd-amdhsa
#include <iostream>
diff --git a/openmp/libomptarget/test/mapping/ompx_hold/struct.c b/openmp/libomptarget/test/mapping/ompx_hold/struct.c
index 450b60178aa4c..b0d4761596e0c 100644
--- a/openmp/libomptarget/test/mapping/ompx_hold/struct.c
+++ b/openmp/libomptarget/test/mapping/ompx_hold/struct.c
@@ -3,7 +3,6 @@
// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
-// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <omp.h>
#include <stdio.h>
diff --git a/openmp/libomptarget/test/offloading/bug49021.cpp b/openmp/libomptarget/test/offloading/bug49021.cpp
index ca15efc0f6baf..909724108aa74 100644
--- a/openmp/libomptarget/test/offloading/bug49021.cpp
+++ b/openmp/libomptarget/test/offloading/bug49021.cpp
@@ -2,7 +2,6 @@
// Hangs
// UNSUPPORTED: amdgcn-amd-amdhsa
-// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
// UNSUPPORTED: amdgcn-amd-amdhsa-newDriver
#include <iostream>
diff --git a/openmp/libomptarget/test/offloading/bug49334.cpp b/openmp/libomptarget/test/offloading/bug49334.cpp
index 4907d32ac9c0f..3ed1d5a035b1c 100644
--- a/openmp/libomptarget/test/offloading/bug49334.cpp
+++ b/openmp/libomptarget/test/offloading/bug49334.cpp
@@ -2,7 +2,6 @@
// Currently hangs on amdgpu
// UNSUPPORTED: amdgcn-amd-amdhsa
-// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
// UNSUPPORTED: x86_64-pc-linux-gnu
#include <cassert>
diff --git a/openmp/libomptarget/test/offloading/bug51781.c b/openmp/libomptarget/test/offloading/bug51781.c
index 999e37b8db10b..b4aa68bdb3a7d 100644
--- a/openmp/libomptarget/test/offloading/bug51781.c
+++ b/openmp/libomptarget/test/offloading/bug51781.c
@@ -34,7 +34,6 @@
// Hangs
// UNSUPPORTED: amdgcn-amd-amdhsa
-// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
// UNSUPPORTED: amdgcn-amd-amdhsa-newDriver
#if ADD_REDUCTION
diff --git a/openmp/libomptarget/test/offloading/global_constructor.cpp b/openmp/libomptarget/test/offloading/global_constructor.cpp
index ae602df8c32e3..d73fe1ad938f3 100644
--- a/openmp/libomptarget/test/offloading/global_constructor.cpp
+++ b/openmp/libomptarget/test/offloading/global_constructor.cpp
@@ -2,7 +2,6 @@
// Fails in DAGToDAG on an address space problem
// UNSUPPORTED: amdgcn-amd-amdhsa
-// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
#include <cmath>
#include <cstdio>
diff --git a/openmp/libomptarget/test/offloading/host_as_target.c b/openmp/libomptarget/test/offloading/host_as_target.c
index 1fa7116444869..28573d8aa4569 100644
--- a/openmp/libomptarget/test/offloading/host_as_target.c
+++ b/openmp/libomptarget/test/offloading/host_as_target.c
@@ -9,7 +9,6 @@
// amdgpu does not have a working printf definition
// XFAIL: amdgcn-amd-amdhsa
-// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <stdio.h>
#include <omp.h>
diff --git a/openmp/libomptarget/test/unified_shared_memory/api.c b/openmp/libomptarget/test/unified_shared_memory/api.c
index c373a2ddb579c..1751fa0beea3a 100644
--- a/openmp/libomptarget/test/unified_shared_memory/api.c
+++ b/openmp/libomptarget/test/unified_shared_memory/api.c
@@ -1,11 +1,10 @@
// RUN: %libomptarget-compile-run-and-check-generic
// XFAIL: nvptx64-nvidia-cuda
-// XFAIL: nvptx64-nvidia-cuda-newRTL
+// XFAIL: nvptx64-nvidia-cuda
// XFAIL: nvptx64-nvidia-cuda-newDriver
// Fails on amdgpu with error: GPU Memory Error
// XFAIL: amdgcn-amd-amdhsa
-// XFAIL: amdgcn-amd-amdhsa-newRTL
// XFAIL: amdgcn-amd-amdhsa-newDriver
#include <stdio.h>
diff --git a/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c b/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c
index 7f1abe3de99db..ec292c857f9c7 100644
--- a/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c
+++ b/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c
@@ -5,7 +5,6 @@
// Fails on amdgpu with error: GPU Memory Error
// XFAIL: amdgcn-amd-amdhsa
-// XFAIL: amdgcn-amd-amdhsa-newRTL
#include <omp.h>
#include <stdio.h>
diff --git a/openmp/libomptarget/test/unified_shared_memory/close_modifier.c b/openmp/libomptarget/test/unified_shared_memory/close_modifier.c
index 6458e246328b0..ce368a359176b 100644
--- a/openmp/libomptarget/test/unified_shared_memory/close_modifier.c
+++ b/openmp/libomptarget/test/unified_shared_memory/close_modifier.c
@@ -5,7 +5,6 @@
// amdgpu runtime crash
// UNSUPPORTED: amdgcn-amd-amdhsa
-// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
#include <omp.h>
diff --git a/openmp/libomptarget/test/unified_shared_memory/shared_update.c b/openmp/libomptarget/test/unified_shared_memory/shared_update.c
index bcd1ade4421b5..b211d333453e9 100644
--- a/openmp/libomptarget/test/unified_shared_memory/shared_update.c
+++ b/openmp/libomptarget/test/unified_shared_memory/shared_update.c
@@ -4,7 +4,6 @@
// amdgpu runtime crash
// UNSUPPORTED: amdgcn-amd-amdhsa
-// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
#include <stdio.h>
#include <omp.h>