[Openmp-commits] [openmp] fa4780f - [OpenMP][USM] Introduces -fopenmp-force-usm flag (#76571)
via Openmp-commits
openmp-commits at lists.llvm.org
Mon Jan 22 12:59:32 PST 2024
Author: Jan Patrick Lehr
Date: 2024-01-22T21:59:26+01:00
New Revision: fa4780fa6cc36188b84b2a977ac15351c39d45dd
URL: https://github.com/llvm/llvm-project/commit/fa4780fa6cc36188b84b2a977ac15351c39d45dd
DIFF: https://github.com/llvm/llvm-project/commit/fa4780fa6cc36188b84b2a977ac15351c39d45dd.diff
LOG: [OpenMP][USM] Introduces -fopenmp-force-usm flag (#76571)
This flag forces the compiler to generate code for OpenMP target regions
as if the user specified the #pragma omp requires unified_shared_memory
in each source file.
The option does not have a -fno-* friend since OpenMP requires the
unified_shared_memory clause to be present in all source files. Since
this flag does no harm if the clause is present, it can be used in
conjunction. My understanding is that USM should not be turned off
selectively, hence, no -fno- version.
This adds a basic test to check the correct generation of double
indirect access to declare target globals in USM mode vs non-USM mode.
Which I think is the only difference observable in code generation.
This runtime test checks for the (non-)occurence of data movement between host
and device. It does one run without the flag and one with the flag to
also see that both versions behave as expected. In the case w/o the new
flag data movement between host and device is expected. In the case with
the flag such data movement should not be present / reported.
Added:
clang/test/OpenMP/force-usm.c
openmp/libomptarget/test/offloading/force-usm.cpp
Modified:
clang/include/clang/Basic/LangOptions.def
clang/include/clang/Driver/Options.td
clang/lib/CodeGen/CGOpenMPRuntime.cpp
clang/lib/Driver/ToolChains/Clang.cpp
openmp/libomptarget/test/lit.cfg
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 47cbd3d30f84785..8fc75e1cca0399f 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -260,6 +260,7 @@ LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have mo
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")
+LANGOPT(OpenMPForceUSM , 1, 0, "Enable OpenMP unified shared memory mode via compiler.")
LANGOPT(NoGPULib , 1, 0, "Indicate a build without the standard GPU libraries.")
LANGOPT(RenderScript , 1, 0, "RenderScript")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 637b10652fcd9cb..819f6f1a15c3f35 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3459,6 +3459,10 @@ def fopenmp_offload_mandatory : Flag<["-"], "fopenmp-offload-mandatory">, Group<
Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CC1Option]>,
HelpText<"Do not create a host fallback if offloading to the device fails.">,
MarshallingInfoFlag<LangOpts<"OpenMPOffloadMandatory">>;
+def fopenmp_force_usm : Flag<["-"], "fopenmp-force-usm">, Group<f_Group>,
+ Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CC1Option]>,
+ HelpText<"Force behvaior as if the user specified pragma omp requires unified_shared_memory.">,
+ MarshallingInfoFlag<LangOpts<"OpenMPForceUSM">>;
def fopenmp_target_jit : Flag<["-"], "fopenmp-target-jit">, Group<f_Group>,
Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CLOption]>,
HelpText<"Emit code that can be JIT compiled for OpenMP offloading. Implies -foffload-lto=full">;
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index ea6645a39e83218..4855e7410a015aa 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1044,6 +1044,13 @@ CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
? CGM.getLangOpts().OMPHostIRFile
: StringRef{});
OMPBuilder.setConfig(Config);
+
+ // The user forces the compiler to behave as if omp requires
+ // unified_shared_memory was given.
+ if (CGM.getLangOpts().OpenMPForceUSM) {
+ HasRequiresUnifiedSharedMemory = true;
+ OMPBuilder.Config.setHasRequiresUnifiedSharedMemory(true);
+ }
}
void CGOpenMPRuntime::clear() {
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 776a13b958893d9..2f33943de45c5da 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6460,6 +6460,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fopenmp-assume-no-nested-parallelism");
if (Args.hasArg(options::OPT_fopenmp_offload_mandatory))
CmdArgs.push_back("-fopenmp-offload-mandatory");
+ if (Args.hasArg(options::OPT_fopenmp_force_usm))
+ CmdArgs.push_back("-fopenmp-force-usm");
break;
default:
// By default, if Clang doesn't know how to generate useful OpenMP code
diff --git a/clang/test/OpenMP/force-usm.c b/clang/test/OpenMP/force-usm.c
new file mode 100644
index 000000000000000..5c63a9a5e70046d
--- /dev/null
+++ b/clang/test/OpenMP/force-usm.c
@@ -0,0 +1,79 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 3
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -fopenmp-force-usm -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-force-usm -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix=CHECK-USM %s
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix=CHECK-DEFAULT %s
+// expected-no-diagnostics
+
+extern "C" void *malloc(unsigned int b);
+
+int GI;
+#pragma omp declare target
+int *pGI;
+#pragma omp end declare target
+
+int main(void) {
+
+ GI = 0;
+
+ pGI = (int *) malloc(sizeof(int));
+ *pGI = 42;
+
+#pragma omp target map(pGI[:1], GI)
+ {
+ GI = 1;
+ *pGI = 2;
+ }
+
+ return 0;
+}
+
+// CHECK-USM-LABEL: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25(
+// CHECK-USM-SAME: ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GI:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-USM-NEXT: entry:
+// CHECK-USM-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-USM-NEXT: [[GI_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-USM-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
+// CHECK-USM-NEXT: [[GI_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GI_ADDR]] to ptr
+// CHECK-USM-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
+// CHECK-USM-NEXT: store ptr [[GI]], ptr [[GI_ADDR_ASCAST]], align 8
+// CHECK-USM-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GI_ADDR_ASCAST]], align 8
+// CHECK-USM-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_kernel_environment to ptr), ptr [[DYN_PTR]])
+// CHECK-USM-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-USM-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-USM: user_code.entry:
+// CHECK-USM-NEXT: store i32 1, ptr [[TMP0]], align 4
+// CHECK-USM-NEXT: [[TMP2:%.*]] = load ptr, ptr @pGI_decl_tgt_ref_ptr, align 8
+// CHECK-USM-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8
+// CHECK-USM-NEXT: store i32 2, ptr [[TMP3]], align 4
+// CHECK-USM-NEXT: call void @__kmpc_target_deinit()
+// CHECK-USM-NEXT: ret void
+// CHECK-USM: worker.exit:
+// CHECK-USM-NEXT: ret void
+//
+//
+// CHECK-DEFAULT-LABEL: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25(
+// CHECK-DEFAULT-SAME: ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GI:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-DEFAULT-NEXT: entry:
+// CHECK-DEFAULT-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-DEFAULT-NEXT: [[GI_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-DEFAULT-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
+// CHECK-DEFAULT-NEXT: [[GI_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GI_ADDR]] to ptr
+// CHECK-DEFAULT-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
+// CHECK-DEFAULT-NEXT: store ptr [[GI]], ptr [[GI_ADDR_ASCAST]], align 8
+// CHECK-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GI_ADDR_ASCAST]], align 8
+// CHECK-DEFAULT-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_kernel_environment to ptr), ptr [[DYN_PTR]])
+// CHECK-DEFAULT-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-DEFAULT-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-DEFAULT: user_code.entry:
+// CHECK-DEFAULT-NEXT: store i32 1, ptr [[TMP0]], align 4
+// CHECK-DEFAULT-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspacecast (ptr addrspace(1) @pGI to ptr), align 8
+// CHECK-DEFAULT-NEXT: store i32 2, ptr [[TMP2]], align 4
+// CHECK-DEFAULT-NEXT: call void @__kmpc_target_deinit()
+// CHECK-DEFAULT-NEXT: ret void
+// CHECK-DEFAULT: worker.exit:
+// CHECK-DEFAULT-NEXT: ret void
+//
diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg
index a0df7314fe974f5..d912b622c05ba1a 100644
--- a/openmp/libomptarget/test/lit.cfg
+++ b/openmp/libomptarget/test/lit.cfg
@@ -190,6 +190,8 @@ for libomptarget_target in config.libomptarget_all_targets:
"%libomptarget-compile-and-run-" + libomptarget_target))
config.substitutions.append(("%libomptarget-compilexx-generic",
"%libomptarget-compilexx-" + libomptarget_target))
+ config.substitutions.append(("%libomptarget-compilexxx-generic-force-usm",
+ "%libomptarget-compilexxx-force-usm-" + libomptarget_target))
config.substitutions.append(("%libomptarget-compile-generic",
"%libomptarget-compile-" + libomptarget_target))
config.substitutions.append(("%libomptarget-compile-fortran-generic",
@@ -247,6 +249,9 @@ for libomptarget_target in config.libomptarget_all_targets:
config.substitutions.append(("%libomptarget-compilexx-" + \
libomptarget_target, \
"%clangxx-" + libomptarget_target + add_libraries(" %s -o %t")))
+ config.substitutions.append(("%libomptarget-compilexxx-force-usm-" +
+ libomptarget_target, "%clangxxx-force-usm-" + libomptarget_target + \
+ add_libraries(" %s -o %t")))
config.substitutions.append(("%libomptarget-compile-" + \
libomptarget_target, \
"%clang-" + libomptarget_target + add_libraries(" %s -o %t")))
@@ -284,6 +289,9 @@ for libomptarget_target in config.libomptarget_all_targets:
config.substitutions.append(("%clangxx-" + libomptarget_target, \
"%clangxx %openmp_flags %cuda_flags %flags %flags_clang -fopenmp-targets=" +\
remove_suffix_if_present(libomptarget_target)))
+ config.substitutions.append(("%clangxxx-force-usm-" + libomptarget_target, \
+ "%clangxx %openmp_flags -fopenmp-force-usm %cuda_flags %flags %flags_clang -fopenmp-targets=" +\
+ remove_suffix_if_present(libomptarget_target)))
config.substitutions.append(("%clang-" + libomptarget_target, \
"%clang %openmp_flags %cuda_flags %flags %flags_clang -fopenmp-targets=" +\
remove_suffix_if_present(libomptarget_target)))
diff --git a/openmp/libomptarget/test/offloading/force-usm.cpp b/openmp/libomptarget/test/offloading/force-usm.cpp
new file mode 100644
index 000000000000000..5bddecd5b4675d6
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/force-usm.cpp
@@ -0,0 +1,59 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-generic
+// RUN: env LIBOMPTARGET_INFO=32 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=NO-USM
+//
+// RUN: %libomptarget-compilexxx-generic-force-usm
+// RUN: env HSA_XNACK=1 LIBOMPTARGET_INFO=32 \
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=FORCE-USM
+//
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// clang-format on
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+
+int GI;
+#pragma omp declare target
+int *pGI;
+#pragma omp end declare target
+
+int main(void) {
+
+ GI = 0;
+ // Implicit mappings
+ int alpha = 1;
+ int beta[3] = {2, 5, 8};
+
+ // Require map clauses for non-USM execution
+ pGI = (int *)malloc(sizeof(int));
+ *pGI = 42;
+
+#pragma omp target map(pGI[ : 1], GI)
+ {
+ GI = 1 * alpha;
+ *pGI = 2 * beta[1];
+ }
+
+ assert(GI == 1);
+ assert(*pGI == 10);
+
+ printf("SUCCESS\n");
+
+ return 0;
+}
+
+// clang-format off
+// NO-USM: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4
+// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=12
+// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4
+// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=8, Name=pGI
+// NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=4
+// NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=12
+// NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=4
+// NO-USM-NEXT: SUCCESS
+
+// FORCE-USM: SUCCESS
+//
+// clang-format on
More information about the Openmp-commits
mailing list