[clang] 8477a0d - [OpenMP] Allow compiling multiple target architectures with OpenMP

Fri May 6 13:57:27 PDT 2022

Author: Joseph Huber
Date: 2022-05-06T16:57:16-04:00
New Revision: 8477a0d769a0b877f14e3ec3fde576b6a3b173ce

URL: https://github.com/llvm/llvm-project/commit/8477a0d769a0b877f14e3ec3fde576b6a3b173ce
DIFF: https://github.com/llvm/llvm-project/commit/8477a0d769a0b877f14e3ec3fde576b6a3b173ce.diff

LOG: [OpenMP] Allow compiling multiple target architectures with OpenMP

This patch adds support for OpenMP to use the `--offload-arch` and
`--no-offload-arch` options. Traditionally, OpenMP has only supported
compiling for a single architecture via the `-Xopenmp-target` option.
Now we can pass in a bound architecture and use that if given; otherwise,
we default to the value of the `-march` option as before.

Note that this only adds basic support; the OpenMP target runtime
does not yet know how to choose between multiple architectures.
Additionally, other parts of the offloading toolchain (e.g. LTO) still
require the `-march` option; these issues should be worked out later.

Reviewed By: tra

Differential Revision: https://reviews.llvm.org/D124721

Added: 
    

Modified: 
    clang/lib/Driver/Driver.cpp
    clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
    clang/lib/Driver/ToolChains/Cuda.cpp
    clang/test/Driver/amdgpu-openmp-toolchain-new.c
    clang/test/Driver/openmp-offload-gpu-new.c

Removed: 
    


################################################################################
diff  --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 5bba896d8c37..b7dd59c45f2d 100644

--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4214,17 +4214,20 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
 /// Returns the canonical name for the offloading architecture when using HIP or
 /// CUDA.
 static StringRef getCanonicalArchString(Compilation &C,
-                                        llvm::opt::DerivedArgList &Args,
+                                        const llvm::opt::DerivedArgList &Args,
                                         StringRef ArchStr,
-                                        Action::OffloadKind Kind) {
-  if (Kind == Action::OFK_Cuda) {
+                                        Action::OffloadKind Kind,
+                                        const ToolChain *TC) {
+  if (Kind == Action::OFK_Cuda ||
+      (Kind == Action::OFK_OpenMP && TC->getTriple().isNVPTX())) {
     CudaArch Arch = StringToCudaArch(ArchStr);
     if (Arch == CudaArch::UNKNOWN || !IsNVIDIAGpuArch(Arch)) {
       C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr;
       return StringRef();
     }
     return Args.MakeArgStringRef(CudaArchToString(Arch));
-  } else if (Kind == Action::OFK_HIP) {
+  } else if (Kind == Action::OFK_HIP ||
+             (Kind == Action::OFK_OpenMP && TC->getTriple().isAMDGPU())) {
     llvm::StringMap<bool> Features;
     // getHIPOffloadTargetTriple() is known to return valid value as it has
     // been called successfully in the CreateOffloadingDeviceToolChains().
@@ -4239,7 +4242,8 @@ static StringRef getCanonicalArchString(Compilation &C,
     return Args.MakeArgStringRef(
         getCanonicalTargetID(Arch.getValue(), Features));
   }
-  return StringRef();
+  // If the input isn't CUDA or HIP just return the architecture.
+  return ArchStr;
 }
 
 /// Checks if the set offloading architectures does not conflict. Returns the
@@ -4259,12 +4263,8 @@ getConflictOffloadArchCombination(const llvm::DenseSet<StringRef> &Archs,
 /// This function returns a set of bound architectures, if there are no bound
 /// architctures we return a set containing only the empty string.
 static llvm::DenseSet<StringRef>
-getOffloadArchs(Compilation &C, llvm::opt::DerivedArgList &Args,
-                Action::OffloadKind Kind) {
-
-  // If this is OpenMP offloading we don't use a bound architecture.
-  if (Kind == Action::OFK_OpenMP)
-    return llvm::DenseSet<StringRef>{StringRef()};
+getOffloadArchs(Compilation &C, const llvm::opt::DerivedArgList &Args,
+                Action::OffloadKind Kind, const ToolChain *TC) {
 
   // --offload and --offload-arch options are mutually exclusive.
   if (Args.hasArgNoClaim(options::OPT_offload_EQ) &&
@@ -4280,12 +4280,12 @@ getOffloadArchs(Compilation &C, llvm::opt::DerivedArgList &Args,
   llvm::DenseSet<StringRef> Archs;
   for (auto &Arg : Args) {
     if (Arg->getOption().matches(options::OPT_offload_arch_EQ)) {
-      Archs.insert(getCanonicalArchString(C, Args, Arg->getValue(), Kind));
+      Archs.insert(getCanonicalArchString(C, Args, Arg->getValue(), Kind, TC));
     } else if (Arg->getOption().matches(options::OPT_no_offload_arch_EQ)) {
       if (Arg->getValue() == StringRef("all"))
         Archs.clear();
       else
-        Archs.erase(getCanonicalArchString(C, Args, Arg->getValue(), Kind));
+        Archs.erase(getCanonicalArchString(C, Args, Arg->getValue(), Kind, TC));
     }
   }
 
@@ -4301,6 +4301,11 @@ getOffloadArchs(Compilation &C, llvm::opt::DerivedArgList &Args,
       Archs.insert(CudaArchToString(CudaArch::CudaDefault));
     else if (Kind == Action::OFK_HIP)
       Archs.insert(CudaArchToString(CudaArch::HIPDefault));
+    else if (Kind == Action::OFK_OpenMP)
+      Archs.insert(StringRef());
+  } else {
+    Args.ClaimAllArgs(options::OPT_offload_arch_EQ);
+    Args.ClaimAllArgs(options::OPT_no_offload_arch_EQ);
   }
 
   return Archs;
@@ -4346,7 +4351,8 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
     // Get the product of all bound architectures and toolchains.
     SmallVector<std::pair<const ToolChain *, StringRef>> TCAndArchs;
     for (const ToolChain *TC : ToolChains)
-      for (StringRef Arch : getOffloadArchs(C, Args, Kind))
+      for (StringRef Arch : getOffloadArchs(
+               C, C.getArgsForToolChain(TC, "generic", Kind), Kind, TC))
         TCAndArchs.push_back(std::make_pair(TC, Arch));
 
     for (unsigned I = 0, E = TCAndArchs.size(); I != E; ++I)
@@ -4375,9 +4381,9 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
           HostAction->setCannotBeCollapsedWithNextDependentAction();
           OffloadAction::HostDependence HDep(
               *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
-              /*BoundArch=*/nullptr, Kind);
+              TCAndArch->second.data(), Kind);
           OffloadAction::DeviceDependences DDep;
-          DDep.add(*A, *TCAndArch->first, /*BoundArch=*/nullptr, Kind);
+          DDep.add(*A, *TCAndArch->first, TCAndArch->second.data(), Kind);
           A = C.MakeAction<OffloadAction>(HDep, DDep);
         } else if (isa<AssembleJobAction>(A) && Kind == Action::OFK_Cuda) {
           // The Cuda toolchain uses fatbinary as the linker phase to bundle the

diff  --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
index 998bb0b9f7c9..fa9b8b64029d 100644
--- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
@@ -307,9 +307,10 @@ llvm::opt::DerivedArgList *AMDGPUOpenMPToolChain::TranslateArgs(
       if (!llvm::is_contained(*DAL, A))
         DAL->append(A);
 
-    std::string Arch = DAL->getLastArgValue(options::OPT_march_EQ).str();
-    if (Arch.empty()) {
-      checkSystemForAMDGPU(Args, *this, Arch);
+    if (!DAL->hasArg(options::OPT_march_EQ)) {
+      std::string Arch = BoundArch.str();
+      if (BoundArch.empty())
+        checkSystemForAMDGPU(Args, *this, Arch);
       DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), Arch);
     }
 

diff  --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 6103c42bf754..666f568187b2 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -847,10 +847,10 @@ CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
       if (!llvm::is_contained(*DAL, A))
         DAL->append(A);
 
-    StringRef Arch = DAL->getLastArgValue(options::OPT_march_EQ);
-    if (Arch.empty())
+    if (!DAL->hasArg(options::OPT_march_EQ))
       DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
-                        CLANG_OPENMP_NVPTX_DEFAULT_ARCH);
+                        !BoundArch.empty() ? BoundArch
+                                           : CLANG_OPENMP_NVPTX_DEFAULT_ARCH);
 
     return DAL;
   }

diff  --git a/clang/test/Driver/amdgpu-openmp-toolchain-new.c b/clang/test/Driver/amdgpu-openmp-toolchain-new.c
index a3213656e8c3..bfb681084f27 100644
--- a/clang/test/Driver/amdgpu-openmp-toolchain-new.c
+++ b/clang/test/Driver/amdgpu-openmp-toolchain-new.c
@@ -3,6 +3,9 @@
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa \
 // RUN:          -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgpu-bc-path=%S/Inputs/hip_dev_lib %s 2>&1 \
 // RUN:   | FileCheck %s
+// RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa \
+// RUN:          --offload-arch=gfx906 --libomptarget-amdgpu-bc-path=%S/Inputs/hip_dev_lib %s 2>&1 \
+// RUN:   | FileCheck %s
 
 // verify the tools invocations
 // CHECK: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-emit-llvm-bc"{{.*}}"-x" "c"
@@ -34,6 +37,7 @@
 // CHECK-NOGPULIB-NOT: "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}}"-target-cpu" "gfx803" "-fcuda-is-device" "-mlink-builtin-bitcode"{{.*}}libomptarget-amdgpu-gfx803.bc"{{.*}}
 
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS
+// RUN:   %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa --offload-arch=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-BINDINGS
 // CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.*]]"], output: "[[HOST_BC:.*]]"
 // CHECK-BINDINGS: "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[DEVICE_BC:.*]]"
 // CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[DEVICE_BC]]"], output: "[[HOST_OBJ:.*]]"

diff  --git a/clang/test/Driver/openmp-offload-gpu-new.c b/clang/test/Driver/openmp-offload-gpu-new.c
index 7281ba71c94a..782f00830987 100644
--- a/clang/test/Driver/openmp-offload-gpu-new.c
+++ b/clang/test/Driver/openmp-offload-gpu-new.c
@@ -10,6 +10,10 @@
 // RUN:          -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_52 \
 // RUN:          --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc %s 2>&1 \
 // RUN:   | FileCheck %s
+// RUN:   %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:          --offload-arch=sm_52 \
+// RUN:          --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc %s 2>&1 \
+// RUN:   | FileCheck %s
 
 // verify the tools invocations
 // CHECK: "-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-emit-llvm-bc"{{.*}}"-x" "c"
@@ -40,6 +44,27 @@
 // CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[DEVICE_OBJ]]"], output: "[[HOST_OBJ:.*]]"
 // CHECK-BINDINGS: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out"
 
+// RUN:   %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda --offload-arch=sm_52 --offload-arch=sm_70 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-ARCH-BINDINGS
+// CHECK-ARCH-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.*]]"], output: "[[HOST_BC:.*]]"
+// CHECK-ARCH-BINDINGS: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[DEVICE_BC_SM_52:.*]]"
+// CHECK-ARCH-BINDINGS: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[DEVICE_BC_SM_52]]"], output: "[[DEVICE_OBJ_SM_52:.*]]"
+// CHECK-ARCH-BINDINGS: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[DEVICE_BC_SM_70:.*]]"
+// CHECK-ARCH-BINDINGS: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[DEVICE_BC_SM_70]]"], output: "[[DEVICE_OBJ_SM_70:.*]]"
+// CHECK-ARCH-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[DEVICE_OBJ_SM_52]]", "[[DEVICE_OBJ_SM_70]]"], output: "[[HOST_OBJ:.*]]"
+// CHECK-ARCH-BINDINGS: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out"
+
+// RUN:   %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp \
+// RUN:     -fopenmp-targets=nvptx64-nvidia-cuda,amdgcn-amd-amdhsa -Xopenmp-target=nvptx64-nvidia-cuda --offload-arch=sm_70 \
+// RUN:     -fopenmp-targets=nvptx64-nvidia-cuda,amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa --offload-arch=gfx908  \
+// RUN:     -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-NVIDIA-AMDGPU
+
+// CHECK-NVIDIA-AMDGPU: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_BC:.+]]"
+// CHECK-NVIDIA-AMDGPU: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[NVIDIA_PTX:.+]]"
+// CHECK-NVIDIA-AMDGPU: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[NVIDIA_PTX]]"], output: "[[NVIDIA_CUBIN:.+]]"
+// CHECK-NVIDIA-AMDGPU: "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[AMD_BC:.+]]"
+// CHECK-NVIDIA-AMDGPU: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[NVIDIA_CUBIN]]", "[[AMD_BC]]"], output: "[[HOST_OBJ:.+]]"
+// CHECK-NVIDIA-AMDGPU: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out"
+
 // RUN:   %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_52 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR
 // CHECK-EMIT-LLVM-IR: "-cc1"{{.*}}"-triple" "nvptx64-nvidia-cuda"{{.*}}"-emit-llvm"