[Openmp-commits] [clang] [openmp] [OpenMP][USM] Introduces -fopenmp-force-usm flag (PR #76571)

Jan Patrick Lehr via Openmp-commits openmp-commits at lists.llvm.org
Thu Jan 18 06:22:21 PST 2024


https://github.com/jplehr updated https://github.com/llvm/llvm-project/pull/76571

>From bf25a538e7c020efde557b595eba64b804cbb817 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Fri, 29 Dec 2023 04:32:24 -0500
Subject: [PATCH 1/4] [OpenMP][USM] Introduces -fopenmp-force-usm flag

This flag forces the compiler to generate code for OpenMP target regions
as if the user specified the #pragma omp requires unified_shared_memory
in each source file.

The option does not have a -fno-* friend since OpenMP requires the
unified_shared_memory clause to be present in all source files. Since
this flag does no harm if the clause is present, it can be used in
conjunction. My understanding is that USM should not be turned off
selectively, hence, no -fno- version.
---
 clang/include/clang/Basic/LangOptions.def | 1 +
 clang/include/clang/Driver/Options.td     | 4 ++++
 clang/lib/CodeGen/CGOpenMPRuntime.cpp     | 7 +++++++
 clang/lib/Driver/ToolChains/Clang.cpp     | 2 ++
 4 files changed, 14 insertions(+)

diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 21abc346cf17ac3..81cf2ad9498a7f9 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -260,6 +260,7 @@ LANGOPT(OpenMPTeamSubscription  , 1, 0, "Assume distributed loops do not have mo
 LANGOPT(OpenMPNoThreadState  , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
 LANGOPT(OpenMPNoNestedParallelism  , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region")
 LANGOPT(OpenMPOffloadMandatory  , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")
+LANGOPT(OpenMPForceUSM     , 1, 0, "Enable OpenMP unified shared memory mode via compiler.")
 LANGOPT(NoGPULib  , 1, 0, "Indicate a build without the standard GPU libraries.")
 LANGOPT(RenderScript      , 1, 0, "RenderScript")
 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 2b93ddf033499cc..28290da438c62db 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3451,6 +3451,10 @@ def fopenmp_offload_mandatory : Flag<["-"], "fopenmp-offload-mandatory">, Group<
   Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CC1Option]>,
   HelpText<"Do not create a host fallback if offloading to the device fails.">,
   MarshallingInfoFlag<LangOpts<"OpenMPOffloadMandatory">>;
+def fopenmp_force_usm : Flag<["-"], "fopenmp-force-usm">, Group<f_Group>,
+  Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CC1Option]>,
+  HelpText<"Force behvaior as if the user specified pragma omp requires unified_shared_memory.">,
+  MarshallingInfoFlag<LangOpts<"OpenMPForceUSM">>;
 def fopenmp_target_jit : Flag<["-"], "fopenmp-target-jit">, Group<f_Group>,
   Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CLOption]>,
   HelpText<"Emit code that can be JIT compiled for OpenMP offloading. Implies -foffload-lto=full">;
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index ea6645a39e83218..4855e7410a015aa 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1044,6 +1044,13 @@ CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
                                          ? CGM.getLangOpts().OMPHostIRFile
                                          : StringRef{});
   OMPBuilder.setConfig(Config);
+
+  // The user forces the compiler to behave as if omp requires
+  // unified_shared_memory was given.
+  if (CGM.getLangOpts().OpenMPForceUSM) {
+    HasRequiresUnifiedSharedMemory = true;
+    OMPBuilder.Config.setHasRequiresUnifiedSharedMemory(true);
+  }
 }
 
 void CGOpenMPRuntime::clear() {
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index acfa119805068d2..ffc24201ab2e0b5 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6382,6 +6382,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
         CmdArgs.push_back("-fopenmp-assume-no-nested-parallelism");
       if (Args.hasArg(options::OPT_fopenmp_offload_mandatory))
         CmdArgs.push_back("-fopenmp-offload-mandatory");
+      if (Args.hasArg(options::OPT_fopenmp_force_usm))
+        CmdArgs.push_back("-fopenmp-force-usm");
       break;
     default:
       // By default, if Clang doesn't know how to generate useful OpenMP code

>From 11ad5633889870d897bfc4e77bc41b569e5ce539 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Wed, 12 Jul 2023 05:04:41 -0400
Subject: [PATCH 2/4] [OpenMP][USM] Adds test for -fopenmp-force-usm flag

This adds a basic test to check the correct generation of double
indirect access to declare target globals in USM mode vs non-USM mode.

Marked as XFAIL to first land test and then enable in subsequent patch.
---
 clang/test/OpenMP/force-usm.c | 74 +++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 clang/test/OpenMP/force-usm.c

diff --git a/clang/test/OpenMP/force-usm.c b/clang/test/OpenMP/force-usm.c
new file mode 100644
index 000000000000000..495e52f9f3d6257
--- /dev/null
+++ b/clang/test/OpenMP/force-usm.c
@@ -0,0 +1,74 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 3
+// REQUIRES: amdgpu-registered-target
+// XFAIL: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -fopenmp-force-usm -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-force-usm -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix=CHECK-USM %s
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix=CHECK-DEFAULT %s
+// expected-no-diagnostics
+
+extern "C" void *malloc(unsigned int b);
+
+int GI;
+#pragma omp declare target
+int *pGI;
+#pragma omp end declare target
+
+int main(void) {
+
+  GI = 0;
+
+  pGI = (int *) malloc(sizeof(int));
+  *pGI = 42;
+
+#pragma omp target map(pGI[:1], GI)
+  {
+    GI = 1;
+    *pGI = 2;
+  }
+
+  return 0;
+}
+
+// CHECK-USM-LABEL: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25
+// CHECK-USM-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[GI:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-USM-NEXT:  entry:
+// CHECK-USM-NEXT:    [[GI_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-USM-NEXT:    [[GI_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GI_ADDR]] to ptr
+// CHECK-USM-NEXT:    store ptr [[GI]], ptr [[GI_ADDR_ASCAST]], align 8
+// CHECK-USM-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[GI_ADDR_ASCAST]], align 8
+// CHECK-USM-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 1, i1 true)
+// CHECK-USM-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-USM-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-USM:       user_code.entry:
+// CHECK-USM-NEXT:    store i32 1, ptr [[TMP0]], align 4
+// CHECK-USM-NEXT:    [[TMP2:%.*]] = load ptr, ptr @pGI_decl_tgt_ref_ptr, align 8
+// CHECK-USM-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8
+// CHECK-USM-NEXT:    store i32 2, ptr [[TMP3]], align 4
+// CHECK-USM-NEXT:    call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 1)
+// CHECK-USM-NEXT:    ret void
+// CHECK-USM:       worker.exit:
+// CHECK-USM-NEXT:    ret void
+//
+//
+// CHECK-DEFAULT-LABEL: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25
+// CHECK-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[GI:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-DEFAULT-NEXT:  entry:
+// CHECK-DEFAULT-NEXT:    [[GI_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-DEFAULT-NEXT:    [[GI_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GI_ADDR]] to ptr
+// CHECK-DEFAULT-NEXT:    store ptr [[GI]], ptr [[GI_ADDR_ASCAST]], align 8
+// CHECK-DEFAULT-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[GI_ADDR_ASCAST]], align 8
+// CHECK-DEFAULT-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 1, i1 true)
+// CHECK-DEFAULT-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-DEFAULT-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK-DEFAULT:       user_code.entry:
+// CHECK-DEFAULT-NEXT:    store i32 1, ptr [[TMP0]], align 4
+// CHECK-DEFAULT-NEXT:    [[TMP2:%.*]] = load ptr, ptr addrspacecast (ptr addrspace(1) @pGI to ptr), align 8
+// CHECK-DEFAULT-NEXT:    store i32 2, ptr [[TMP2]], align 4
+// CHECK-DEFAULT-NEXT:    call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 1)
+// CHECK-DEFAULT-NEXT:    ret void
+// CHECK-DEFAULT:       worker.exit:
+// CHECK-DEFAULT-NEXT:    ret void
+//

>From e093db3f1bc432654c2105430f2f647f6d2ab362 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Thu, 18 Jan 2024 08:45:21 -0500
Subject: [PATCH 3/4] [OpenMP][USM] Adds a runtime test for force USM flag

This runtime test checks for the occurence of data movement between host
and device. It does one run without the flag and one with the flag to
also see that both versions behave as expected.
---
 openmp/libomptarget/test/lit.cfg              |  8 ++++
 .../test/offloading/force-usm.cpp             | 45 +++++++++++++++++++
 2 files changed, 53 insertions(+)
 create mode 100644 openmp/libomptarget/test/offloading/force-usm.cpp

diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg
index 19c5e5c45722271..3b9b9da4649dbe5 100644
--- a/openmp/libomptarget/test/lit.cfg
+++ b/openmp/libomptarget/test/lit.cfg
@@ -185,6 +185,8 @@ for libomptarget_target in config.libomptarget_all_targets:
             "%libomptarget-compile-and-run-" + libomptarget_target))
         config.substitutions.append(("%libomptarget-compilexx-generic",
             "%libomptarget-compilexx-" + libomptarget_target))
+        config.substitutions.append(("%libomptarget-compilexxx-generic-force-usm",
+            "%libomptarget-compilexxx-force-usm-" + libomptarget_target))
         config.substitutions.append(("%libomptarget-compile-generic",
             "%libomptarget-compile-" + libomptarget_target))
         config.substitutions.append(("%libomptarget-compile-fortran-generic",
@@ -242,6 +244,9 @@ for libomptarget_target in config.libomptarget_all_targets:
         config.substitutions.append(("%libomptarget-compilexx-" + \
             libomptarget_target, \
             "%clangxx-" + libomptarget_target + add_libraries(" %s -o %t")))
+        config.substitutions.append(("%libomptarget-compilexxx-force-usm-" +
+            libomptarget_target, "%clangxxx-force-usm-" + libomptarget_target + \
+                                     add_libraries(" %s -o %t")))
         config.substitutions.append(("%libomptarget-compile-" + \
             libomptarget_target, \
             "%clang-" + libomptarget_target + add_libraries(" %s -o %t")))
@@ -279,6 +284,9 @@ for libomptarget_target in config.libomptarget_all_targets:
         config.substitutions.append(("%clangxx-" + libomptarget_target, \
                                      "%clangxx %openmp_flags %cuda_flags %flags %flags_clang -fopenmp-targets=" +\
                                      remove_suffix_if_present(libomptarget_target)))
+        config.substitutions.append(("%clangxxx-force-usm-" + libomptarget_target, \
+                                     "%clangxx %openmp_flags -fopenmp-force-usm  %cuda_flags %flags %flags_clang -fopenmp-targets=" +\
+                                     remove_suffix_if_present(libomptarget_target)))
         config.substitutions.append(("%clang-" + libomptarget_target, \
                                      "%clang %openmp_flags %cuda_flags %flags %flags_clang -fopenmp-targets=" +\
                                      remove_suffix_if_present(libomptarget_target)))
diff --git a/openmp/libomptarget/test/offloading/force-usm.cpp b/openmp/libomptarget/test/offloading/force-usm.cpp
new file mode 100644
index 000000000000000..f51f7e782e8d3ea
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/force-usm.cpp
@@ -0,0 +1,45 @@
+// RUN: %libomptarget-compilexx-generic
+// RUN: env LIBOMPTARGET_INFO=32 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=NO-USM
+//
+// RUN: %libomptarget-compilexxx-generic-force-usm
+// RUN: env HSA_XNACK=1 LIBOMPTARGET_INFO=32 \
+// RUN:       %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=FORCE-USM
+//
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+
+#include <cstdio>
+#include <cstdlib>
+
+
+int GI;
+#pragma omp declare target
+int *pGI;
+#pragma omp end declare target
+
+int main(void) {
+
+  GI = 0;
+
+  pGI = (int *) malloc(sizeof(int));
+  *pGI = 42;
+
+#pragma omp target map(pGI[:1], GI)
+  {
+    GI = 1;
+    *pGI = 2;
+  }
+
+  printf("SUCCESS\n");
+
+  return 0;
+}
+
+// NO-USM: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4
+// NO-USM: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4
+// NO-USM: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=8, Name=pGI
+// NO-USM: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=4
+// NO-USM: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=4
+// NO-USM: SUCCESS
+
+// FORCE-USM: SUCCESS

>From 5551b08712e02fa2f86c9d321d5724665eb66844 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Thu, 18 Jan 2024 08:53:14 -0500
Subject: [PATCH 4/4] Format

---
 openmp/libomptarget/test/offloading/force-usm.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/openmp/libomptarget/test/offloading/force-usm.cpp b/openmp/libomptarget/test/offloading/force-usm.cpp
index f51f7e782e8d3ea..ff7bcfdaa44d9ef 100644
--- a/openmp/libomptarget/test/offloading/force-usm.cpp
+++ b/openmp/libomptarget/test/offloading/force-usm.cpp
@@ -1,3 +1,4 @@
+// clang-format off
 // RUN: %libomptarget-compilexx-generic
 // RUN: env LIBOMPTARGET_INFO=32 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=NO-USM
 //
@@ -7,11 +8,11 @@
 //
 // UNSUPPORTED: nvptx64-nvidia-cuda
 // UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// clang-format on
 
 #include <cstdio>
 #include <cstdlib>
 
-
 int GI;
 #pragma omp declare target
 int *pGI;
@@ -21,10 +22,10 @@ int main(void) {
 
   GI = 0;
 
-  pGI = (int *) malloc(sizeof(int));
+  pGI = (int *)malloc(sizeof(int));
   *pGI = 42;
 
-#pragma omp target map(pGI[:1], GI)
+#pragma omp target map(pGI[ : 1], GI)
   {
     GI = 1;
     *pGI = 2;
@@ -35,6 +36,7 @@ int main(void) {
   return 0;
 }
 
+// clang-format off
 // NO-USM: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4
 // NO-USM: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4
 // NO-USM: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=8, Name=pGI
@@ -43,3 +45,5 @@ int main(void) {
 // NO-USM: SUCCESS
 
 // FORCE-USM: SUCCESS
+//
+// clang-format on



More information about the Openmp-commits mailing list