[PATCH] D34784: [OpenMP] Add flag for specifying the target device architecture for OpenMP device offloading

Gheorghe-Teodor Bercea via Phabricator via cfe-commits cfe-commits at lists.llvm.org
Wed Jun 28 15:44:19 PDT 2017


gtbercea created this revision.

OpenMP can offload target regions to devices whose architecture may differ from that of the host.

A new -fopenmp-target-arch flag is introduced to specify the device architecture.

In this patch the new flag is used to specify the compute capability of the underlying NVIDIA device in the OpenMP offloading CUDA tool chain.
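
For illustration, an invocation of the new flag would look roughly as follows (a sketch: the nvptx64-nvidia-cuda offloading triple is illustrative only, and sm_35 matches the value used in the test below):

  clang -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
        -fopenmp-target-arch=sm_35 foo.c

If the flag is omitted, the CUDA tool chain falls back to a default compute capability of sm_20.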

Only a host-offloading test is provided, since full device offloading will become available only once D29654 <https://reviews.llvm.org/D29654> lands.


Repository:
  rL LLVM

https://reviews.llvm.org/D34784

Files:
  include/clang/Driver/Options.td
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/openmp-offload.c


Index: test/Driver/openmp-offload.c
===================================================================
--- test/Driver/openmp-offload.c
+++ test/Driver/openmp-offload.c
@@ -599,3 +599,11 @@
 // CHK-FOPENMP-IS-DEVICE: clang{{.*}}.i" {{.*}}" "-fopenmp-is-device"
 // CHK-FOPENMP-IS-DEVICE-NEXT: clang{{.*}}.bc" {{.*}}.i" "-fopenmp-is-device" "-fopenmp-host-ir-file-path"
 // CHK-FOPENMP-IS-DEVICE-NEXT: clang{{.*}}.s" {{.*}}.bc" "-fopenmp-is-device"
+
+/// ###########################################################################
+
+/// Check that -fopenmp-target-arch is accepted; since this is a host-offloading compile, the driver reports it as unused.
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=powerpc64le-ibm-linux-gnu -save-temps -no-canonical-prefixes -fopenmp-target-arch=sm_35 %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-COMPUTE-CAPABILITY %s
+
+// CHK-COMPUTE-CAPABILITY: clang: warning: argument unused during compilation: '-fopenmp-target-arch=sm_35'
Index: lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -212,8 +212,20 @@
       static_cast<const toolchains::CudaToolChain &>(getToolChain());
   assert(TC.getTriple().isNVPTX() && "Wrong platform");
 
+  StringRef GPUArchName;
+  std::vector<std::string> GPUArchNames;
+  // If this is an OpenMP action, we need to extract the device architecture
+  // from the -fopenmp-target-arch option.
+  if (JA.isDeviceOffloading(Action::OFK_OpenMP)) {
+    GPUArchNames = Args.getAllArgValues(options::OPT_fopenmp_target_arch_EQ);
+    assert(GPUArchNames.size() == 1 &&
+           "Exactly one GPU Arch required for ptxas.");
+    GPUArchName = GPUArchNames[0];
+  } else
+    GPUArchName = JA.getOffloadingArch();
+
   // Obtain architecture from the action.
-  CudaArch gpu_arch = StringToCudaArch(JA.getOffloadingArch());
+  CudaArch gpu_arch = StringToCudaArch(GPUArchName);
   assert(gpu_arch != CudaArch::UNKNOWN &&
          "Device action expected to have an architecture.");
 
@@ -342,7 +354,9 @@
     Action::OffloadKind DeviceOffloadingKind) const {
   HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);
 
-  StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
+  StringRef GpuArch = DriverArgs.getLastArgValue(
+      DeviceOffloadingKind == Action::OFK_OpenMP ?
+          options::OPT_fopenmp_target_arch_EQ : options::OPT_march_EQ);
   assert(!GpuArch.empty() && "Must have an explicit GPU arch.");
   assert((DeviceOffloadingKind == Action::OFK_OpenMP ||
           DeviceOffloadingKind == Action::OFK_Cuda) &&
@@ -364,7 +378,6 @@
   }
 
   std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch);
-
   if (LibDeviceFile.empty()) {
     getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch;
     return;
@@ -405,7 +418,7 @@
 
   // For OpenMP device offloading, append derived arguments. Make sure
   // flags are not duplicated.
-  // TODO: Append the compute capability.
+  // Also append the compute capability.
   if (DeviceOffloadKind == Action::OFK_OpenMP) {
     for (Arg *A : Args){
       bool IsDuplicate = false;
@@ -418,6 +431,14 @@
       if (!IsDuplicate)
         DAL->append(A);
     }
+
+    // If the user did not specify a compute capability via the
+    // -fopenmp-target-arch flag, default to sm_20 since this is a
+    // CUDA tool chain.
+    if (Args.getAllArgValues(options::OPT_fopenmp_target_arch_EQ).empty())
+      DAL->AddJoinedArg(nullptr,
+          Opts.getOption(options::OPT_fopenmp_target_arch_EQ), "sm_20");
+
     return DAL;
   }
 
Index: include/clang/Driver/Options.td
===================================================================
--- include/clang/Driver/Options.td
+++ include/clang/Driver/Options.td
@@ -1294,6 +1294,8 @@
   HelpText<"Specify comma-separated list of triples OpenMP offloading targets to be supported">;
 def fopenmp_dump_offload_linker_script : Flag<["-"], "fopenmp-dump-offload-linker-script">, Group<f_Group>, 
   Flags<[NoArgumentUnused]>;
+def fopenmp_target_arch_EQ : Joined<["-"], "fopenmp-target-arch=">, Flags<[DriverOption]>,
+  HelpText<"Pass a single target architecture (default for NVIDIA is sm_20) to be used by OpenMP device offloading.">;
 def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group<f_Group>;
 def foptimize_sibling_calls : Flag<["-"], "foptimize-sibling-calls">, Group<f_Group>;
 def force__cpusubtype__ALL : Flag<["-"], "force_cpusubtype_ALL">;

