r282865 - [CUDA][OpenMP] Add a generic offload action builder
Samuel Antao via cfe-commits
cfe-commits at lists.llvm.org
Fri Sep 30 08:34:19 PDT 2016
Author: sfantao
Date: Fri Sep 30 10:34:19 2016
New Revision: 282865
URL: http://llvm.org/viewvc/llvm-project?rev=282865&view=rev
Log:
[CUDA][OpenMP] Add a generic offload action builder
Summary:
This patch proposes a new class to generate and record action dependences related to offloading. The builder provides three main functionalities:
- Add device dependences to host actions.
- Add host dependence to device actions.
- Register device top-level actions.
The constructor of the builder detects the programming models that should be supported and generates a specialized builder for each. If a new programming model is added in the future, only a new specialized builder has to be implemented (a skeleton sketch follows below).
When a specialized builder is created, it produces programming-model-specific diagnostics.
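As a rough illustration only (not part of this patch; `FooActionBuilder` and its offload kind are placeholders), a builder for a new programming model would subclass `DeviceActionBuilder`, override the hooks it needs, and be registered next to the CUDA builder:

```cpp
// Hypothetical sketch: "Foo" stands for a future programming model.
class FooActionBuilder final : public DeviceActionBuilder {
public:
  FooActionBuilder(Compilation &C, DerivedArgList &Args,
                   const Driver::InputList &Inputs)
      : DeviceActionBuilder(C, Args, Inputs,
                            /*placeholder offload kind*/ Action::OFK_None) {}

  bool initialize() override {
    // Detect whether this model is requested, emit model-specific
    // diagnostics, and populate ToolChains; returning true reports an error.
    return false;
  }

  ActionBuilderReturnCode
  getDeviceDepences(OffloadAction::DeviceDependences &DA, phases::ID CurPhase,
                    phases::ID FinalPhase, PhasesTy &Phases) override {
    // Create device actions for the current phase and record them in DA so
    // the generic builder can attach them to the host action.
    return ABRT_Inactive;
  }
};

// Registration in the OffloadingActionBuilder constructor:
//   SpecializedBuilders.push_back(new FooActionBuilder(C, Args, Inputs));
```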
A CUDA specialized builder is proposed in the patch; it mostly consists of partitioning the existing `buildCudaActions` logic along the three functionalities above. A condensed sketch of how the driver uses the builder is also included below.
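For reference, here is a condensed, error-handling-free sketch of how `Driver::BuildActions` drives the builder after this patch, assembled from the Driver.cpp hunks further down (bookkeeping such as adding `Current` to the output list is elided):

```cpp
// Condensed sketch of the new BuildActions flow; see the actual diff below.
OffloadingActionBuilder OffloadBuilder(C, Args, Inputs);

for (auto &I : Inputs) {
  Action *Current = C.MakeAction<InputAction>(*InputArg, InputType);

  // Let the specialized builders see the new host input so they can create
  // their device-side input actions.
  OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg);

  for (phases::ID Phase : PL) {
    // Attach device dependences (if any) to the host action for this phase.
    // A nullptr result means the host action should not be emitted, e.g. in
    // device-only compilations.
    Current = OffloadBuilder.addDeviceDependencesToHostAction(
        Current, InputArg, Phase, FinalPhase, PL);
    if (!Current)
      break;

    Current = ConstructPhaseAction(C, Args, Phase, Current);
    OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg);
  }

  // Device-only or partial results become top-level actions of their own.
  OffloadBuilder.appendTopLevelActions(Actions, Current, InputArg);
}

// The host link action, if any, is wrapped with the device link dependences
// and the offload kinds used in this compilation.
if (!LinkerInputs.empty()) {
  Action *LA = C.MakeAction<LinkJobAction>(LinkerInputs, types::TY_Image);
  Actions.push_back(OffloadBuilder.processHostLinkAction(LA));
}
```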
Reviewers: tra, echristo, ABataev, jlebar, hfinkel
Subscribers: Hahnfeld, whchung, guansong, jlebar, mehdi_amini, andreybokhanko, tcramer, mkuron, cfe-commits, arpith-jacob, carlo.bertolli, caomhin
Differential Revision: https://reviews.llvm.org/D18172
Modified:
cfe/trunk/include/clang/Driver/Compilation.h
cfe/trunk/lib/Driver/Driver.cpp
cfe/trunk/lib/Driver/Types.cpp
cfe/trunk/test/Driver/cuda-bindings.cu
cfe/trunk/test/Driver/cuda-phases.cu
Modified: cfe/trunk/include/clang/Driver/Compilation.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Driver/Compilation.h?rev=282865&r1=282864&r2=282865&view=diff
==============================================================================
--- cfe/trunk/include/clang/Driver/Compilation.h (original)
+++ cfe/trunk/include/clang/Driver/Compilation.h Fri Sep 30 10:34:19 2016
@@ -115,6 +115,12 @@ public:
return OrderedOffloadingToolchains.equal_range(Kind);
}
+ /// Return true if an offloading tool chain of a given kind exists.
+ template <Action::OffloadKind Kind> bool hasOffloadToolChain() const {
+ return OrderedOffloadingToolchains.find(Kind) !=
+ OrderedOffloadingToolchains.end();
+ }
+
/// Return an offload toolchain of the provided kind. Only one is expected to
/// exist.
template <Action::OffloadKind Kind>
Modified: cfe/trunk/lib/Driver/Driver.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/Driver.cpp?rev=282865&r1=282864&r2=282865&view=diff
==============================================================================
--- cfe/trunk/lib/Driver/Driver.cpp (original)
+++ cfe/trunk/lib/Driver/Driver.cpp Fri Sep 30 10:34:19 2016
@@ -1400,139 +1400,536 @@ void Driver::BuildInputs(const ToolChain
}
}
-// For each unique --cuda-gpu-arch= argument creates a TY_CUDA_DEVICE
-// input action and then wraps each in CudaDeviceAction paired with
-// appropriate GPU arch name. In case of partial (i.e preprocessing
-// only) or device-only compilation, each device action is added to /p
-// Actions and /p Current is released. Otherwise the function creates
-// and returns a new CudaHostAction which wraps /p Current and device
-// side actions.
-static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
- const Arg *InputArg, Action *HostAction,
- ActionList &Actions) {
- Arg *PartialCompilationArg = Args.getLastArg(
- options::OPT_cuda_host_only, options::OPT_cuda_device_only,
- options::OPT_cuda_compile_host_device);
- bool CompileHostOnly =
- PartialCompilationArg &&
- PartialCompilationArg->getOption().matches(options::OPT_cuda_host_only);
- bool CompileDeviceOnly =
- PartialCompilationArg &&
- PartialCompilationArg->getOption().matches(options::OPT_cuda_device_only);
- const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
- assert(HostTC && "No toolchain for host compilation.");
- if (HostTC->getTriple().isNVPTX()) {
- // We do not support targeting NVPTX for host compilation. Throw
- // an error and abort pipeline construction early so we don't trip
- // asserts that assume device-side compilation.
- C.getDriver().Diag(diag::err_drv_cuda_nvptx_host);
- return nullptr;
- }
-
- if (CompileHostOnly) {
- OffloadAction::HostDependence HDep(*HostAction, *HostTC,
- /*BoundArch=*/nullptr, Action::OFK_Cuda);
- return C.MakeAction<OffloadAction>(HDep);
- }
-
- // Collect all cuda_gpu_arch parameters, removing duplicates.
- SmallVector<CudaArch, 4> GpuArchList;
- llvm::SmallSet<CudaArch, 4> GpuArchs;
- for (Arg *A : Args) {
- if (!A->getOption().matches(options::OPT_cuda_gpu_arch_EQ))
- continue;
- A->claim();
+namespace {
+/// Provides a convenient interface for different programming models to generate
+/// the required device actions.
+class OffloadingActionBuilder final {
+ /// Flag used to trace errors in the builder.
+ bool IsValid = false;
+
+ /// The compilation that is using this builder.
+ Compilation &C;
+
+ /// The derived arguments associated with this builder.
+ DerivedArgList &Args;
+
+ /// Map between an input argument and the offload kinds used to process it.
+ std::map<const Arg *, unsigned> InputArgToOffloadKindMap;
+
+ /// Builder interface. It doesn't build anything or keep any state.
+ class DeviceActionBuilder {
+ public:
+ typedef llvm::SmallVector<phases::ID, phases::MaxNumberOfPhases> PhasesTy;
+
+ enum ActionBuilderReturnCode {
+ // The builder acted successfully on the current action.
+ ABRT_Success,
+ // The builder didn't have to act on the current action.
+ ABRT_Inactive,
+ // The builder was successful and requested the host action to not be
+ // generated.
+ ABRT_Ignore_Host,
+ };
+
+ protected:
+ /// Compilation associated with this builder.
+ Compilation &C;
+
+ /// Tool chains associated with this builder. The same programming
+ /// model may have associated one or more tool chains.
+ SmallVector<const ToolChain *, 2> ToolChains;
+
+ /// The derived arguments associated with this builder.
+ DerivedArgList &Args;
+
+ /// The inputs associated with this builder.
+ const Driver::InputList &Inputs;
+
+ /// The associated offload kind.
+ Action::OffloadKind AssociatedOffloadKind = Action::OFK_None;
+
+ public:
+ DeviceActionBuilder(Compilation &C, DerivedArgList &Args,
+ const Driver::InputList &Inputs,
+ Action::OffloadKind AssociatedOffloadKind)
+ : C(C), Args(Args), Inputs(Inputs),
+ AssociatedOffloadKind(AssociatedOffloadKind) {}
+ virtual ~DeviceActionBuilder() {}
+
+ /// Fill up the array \a DA with all the device dependences that should be
+ /// added to the provided host action \a HostAction. By default it is
+ /// inactive.
+ virtual ActionBuilderReturnCode
+ getDeviceDepences(OffloadAction::DeviceDependences &DA, phases::ID CurPhase,
+ phases::ID FinalPhase, PhasesTy &Phases) {
+ return ABRT_Inactive;
+ }
+
+ /// Update the state to include the provided host action \a HostAction as a
+ /// dependency of the current device action. By default it is inactive.
+ virtual ActionBuilderReturnCode addDeviceDepences(Action *HostAction) {
+ return ABRT_Inactive;
+ }
+
+    /// Append top level actions generated by the builder.
+ virtual void appendTopLevelActions(ActionList &AL) {}
+
+    /// Append linker actions generated by the builder.
+ virtual void appendLinkDependences(OffloadAction::DeviceDependences &DA) {}
+
+ /// Initialize the builder. Return true if any initialization errors are
+ /// found.
+ virtual bool initialize() { return false; }
+
+ /// Return true if this builder is valid. We have a valid builder if we have
+ /// associated device tool chains.
+ bool isValid() { return !ToolChains.empty(); }
+
+ /// Return the associated offload kind.
+ Action::OffloadKind getAssociatedOffloadKind() {
+ return AssociatedOffloadKind;
+ }
+ };
+
+ /// \brief CUDA action builder. It injects device code in the host backend
+ /// action.
+ class CudaActionBuilder final : public DeviceActionBuilder {
+ /// Flags to signal if the user requested host-only or device-only
+ /// compilation.
+ bool CompileHostOnly = false;
+ bool CompileDeviceOnly = false;
+
+ /// List of GPU architectures to use in this compilation.
+ SmallVector<CudaArch, 4> GpuArchList;
+
+ /// The CUDA actions for the current input.
+ ActionList CudaDeviceActions;
+
+ /// The CUDA fat binary if it was generated for the current input.
+ Action *CudaFatBinary = nullptr;
+
+ /// Flag that is set to true if this builder acted on the current input.
+ bool IsActive = false;
+
+ public:
+ CudaActionBuilder(Compilation &C, DerivedArgList &Args,
+ const Driver::InputList &Inputs)
+ : DeviceActionBuilder(C, Args, Inputs, Action::OFK_Cuda) {}
+
+ ActionBuilderReturnCode
+ getDeviceDepences(OffloadAction::DeviceDependences &DA, phases::ID CurPhase,
+ phases::ID FinalPhase, PhasesTy &Phases) override {
+ if (!IsActive)
+ return ABRT_Inactive;
+
+ // If we don't have more CUDA actions, we don't have any dependences to
+ // create for the host.
+ if (CudaDeviceActions.empty())
+ return ABRT_Success;
+
+ assert(CudaDeviceActions.size() == GpuArchList.size() &&
+ "Expecting one action per GPU architecture.");
+ assert(!CompileHostOnly &&
+ "Not expecting CUDA actions in host-only compilation.");
+
+ // If we are generating code for the device or we are in a backend phase,
+ // we attempt to generate the fat binary. We compile each arch to ptx and
+ // assemble to cubin, then feed the cubin *and* the ptx into a device
+ // "link" action, which uses fatbinary to combine these cubins into one
+ // fatbin. The fatbin is then an input to the host action if not in
+ // device-only mode.
+ if (CompileDeviceOnly || CurPhase == phases::Backend) {
+ ActionList DeviceActions;
+ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+ // Produce the device action from the current phase up to the assemble
+ // phase.
+ for (auto Ph : Phases) {
+ // Skip the phases that were already dealt with.
+ if (Ph < CurPhase)
+ continue;
+ // We have to be consistent with the host final phase.
+ if (Ph > FinalPhase)
+ break;
+
+ CudaDeviceActions[I] = C.getDriver().ConstructPhaseAction(
+ C, Args, Ph, CudaDeviceActions[I]);
+
+ if (Ph == phases::Assemble)
+ break;
+ }
+
+ // If we didn't reach the assemble phase, we can't generate the fat
+ // binary. We don't need to generate the fat binary if we are not in
+ // device-only mode.
+ if (!isa<AssembleJobAction>(CudaDeviceActions[I]) ||
+ CompileDeviceOnly)
+ continue;
+
+ Action *AssembleAction = CudaDeviceActions[I];
+ assert(AssembleAction->getType() == types::TY_Object);
+ assert(AssembleAction->getInputs().size() == 1);
+
+ Action *BackendAction = AssembleAction->getInputs()[0];
+ assert(BackendAction->getType() == types::TY_PP_Asm);
+
+ for (auto &A : {AssembleAction, BackendAction}) {
+ OffloadAction::DeviceDependences DDep;
+ DDep.add(*A, *ToolChains.front(), CudaArchToString(GpuArchList[I]),
+ Action::OFK_Cuda);
+ DeviceActions.push_back(
+ C.MakeAction<OffloadAction>(DDep, A->getType()));
+ }
+ }
+
+ // We generate the fat binary if we have device input actions.
+ if (!DeviceActions.empty()) {
+ CudaFatBinary =
+ C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);
+
+ if (!CompileDeviceOnly) {
+ DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
+ Action::OFK_Cuda);
+          // Clear the fat binary; it is already a dependence of a host
+          // action.
+ CudaFatBinary = nullptr;
+ }
+
+        // Remove the CUDA actions as they are already connected to a host
+        // action or fat binary.
+ CudaDeviceActions.clear();
+ }
- const auto &ArchStr = A->getValue();
- CudaArch Arch = StringToCudaArch(ArchStr);
- if (Arch == CudaArch::UNKNOWN)
- C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr;
- else if (GpuArchs.insert(Arch).second)
- GpuArchList.push_back(Arch);
- }
-
- // Default to sm_20 which is the lowest common denominator for supported GPUs.
- // sm_20 code should work correctly, if suboptimally, on all newer GPUs.
- if (GpuArchList.empty())
- GpuArchList.push_back(CudaArch::SM_20);
-
- // Replicate inputs for each GPU architecture.
- Driver::InputList CudaDeviceInputs;
- for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
- CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg));
-
- // Build actions for all device inputs.
- ActionList CudaDeviceActions;
- C.getDriver().BuildActions(C, Args, CudaDeviceInputs, CudaDeviceActions);
- assert(GpuArchList.size() == CudaDeviceActions.size() &&
- "Failed to create actions for all devices");
-
- // Check whether any of device actions stopped before they could generate PTX.
- bool PartialCompilation =
- llvm::any_of(CudaDeviceActions, [](const Action *a) {
- return a->getKind() != Action::AssembleJobClass;
- });
-
- const ToolChain *CudaTC = C.getSingleOffloadToolChain<Action::OFK_Cuda>();
-
- // Figure out what to do with device actions -- pass them as inputs to the
- // host action or run each of them independently.
- if (PartialCompilation || CompileDeviceOnly) {
- // In case of partial or device-only compilation results of device actions
- // are not consumed by the host action device actions have to be added to
- // top-level actions list with AtTopLevel=true and run independently.
-
- // -o is ambiguous if we have more than one top-level action.
- if (Args.hasArg(options::OPT_o) &&
- (!CompileDeviceOnly || GpuArchList.size() > 1)) {
- C.getDriver().Diag(
- clang::diag::err_drv_output_argument_with_multiple_files);
- return nullptr;
+      // We avoid creating a host action in device-only mode.
+ return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success;
+ }
+
+ assert(CurPhase < phases::Backend && "Generating single CUDA "
+ "instructions should only occur "
+ "before the backend phase!");
+
+ // By default, we produce an action for each device arch.
+ for (Action *&A : CudaDeviceActions)
+ A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A);
+
+ return ABRT_Success;
+ }
+
+ ActionBuilderReturnCode addDeviceDepences(Action *HostAction) override {
+ // While generating code for CUDA, we only depend on the host input action
+ // to trigger the creation of all the CUDA device actions.
+
+ // If we are dealing with an input action, replicate it for each GPU
+ // architecture. If we are in host-only mode we return 'success' so that
+ // the host uses the CUDA offload kind.
+ if (auto *IA = dyn_cast<InputAction>(HostAction)) {
+ assert(!GpuArchList.empty() &&
+ "We should have at least one GPU architecture.");
+
+ // If the host input is not CUDA, we don't need to bother about this
+ // input.
+ if (IA->getType() != types::TY_CUDA) {
+ // The builder will ignore this input.
+ IsActive = false;
+ return ABRT_Inactive;
+ }
+
+ // Set the flag to true, so that the builder acts on the current input.
+ IsActive = true;
+
+ if (CompileHostOnly)
+ return ABRT_Success;
+
+ // Replicate inputs for each GPU architecture.
+ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
+ CudaDeviceActions.push_back(C.MakeAction<InputAction>(
+ IA->getInputArg(), types::TY_CUDA_DEVICE));
+
+ return ABRT_Success;
+ }
+
+ return IsActive ? ABRT_Success : ABRT_Inactive;
}
- for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
- OffloadAction::DeviceDependences DDep;
- DDep.add(*CudaDeviceActions[I], *CudaTC, CudaArchToString(GpuArchList[I]),
- Action::OFK_Cuda);
- Actions.push_back(
- C.MakeAction<OffloadAction>(DDep, CudaDeviceActions[I]->getType()));
+ void appendTopLevelActions(ActionList &AL) override {
+ // Utility to append actions to the top level list.
+ auto AddTopLevel = [&](Action *A, CudaArch BoundArch) {
+ OffloadAction::DeviceDependences Dep;
+ Dep.add(*A, *ToolChains.front(), CudaArchToString(BoundArch),
+ Action::OFK_Cuda);
+ AL.push_back(C.MakeAction<OffloadAction>(Dep, A->getType()));
+ };
+
+ // If we have a fat binary, add it to the list.
+ if (CudaFatBinary) {
+ AddTopLevel(CudaFatBinary, CudaArch::UNKNOWN);
+ CudaDeviceActions.clear();
+ CudaFatBinary = nullptr;
+ return;
+ }
+
+ if (CudaDeviceActions.empty())
+ return;
+
+      // If we have CUDA actions at this point, that's because we have a
+ // partial compilation, so we should have an action for each GPU
+ // architecture.
+ assert(CudaDeviceActions.size() == GpuArchList.size() &&
+ "Expecting one action per GPU architecture.");
+ assert(ToolChains.size() == 1 &&
+             "Expecting to have a single CUDA toolchain.");
+ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
+ AddTopLevel(CudaDeviceActions[I], GpuArchList[I]);
+
+ CudaDeviceActions.clear();
+ }
+
+ bool initialize() override {
+ // We don't need to support CUDA.
+ if (!C.hasOffloadToolChain<Action::OFK_Cuda>())
+ return false;
+
+ const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
+ assert(HostTC && "No toolchain for host compilation.");
+ if (HostTC->getTriple().isNVPTX()) {
+ // We do not support targeting NVPTX for host compilation. Throw
+ // an error and abort pipeline construction early so we don't trip
+ // asserts that assume device-side compilation.
+ C.getDriver().Diag(diag::err_drv_cuda_nvptx_host);
+ return true;
+ }
+
+ ToolChains.push_back(C.getSingleOffloadToolChain<Action::OFK_Cuda>());
+
+ Arg *PartialCompilationArg = Args.getLastArg(
+ options::OPT_cuda_host_only, options::OPT_cuda_device_only,
+ options::OPT_cuda_compile_host_device);
+ CompileHostOnly = PartialCompilationArg &&
+ PartialCompilationArg->getOption().matches(
+ options::OPT_cuda_host_only);
+ CompileDeviceOnly = PartialCompilationArg &&
+ PartialCompilationArg->getOption().matches(
+ options::OPT_cuda_device_only);
+
+ // Collect all cuda_gpu_arch parameters, removing duplicates.
+ llvm::SmallSet<CudaArch, 4> GpuArchs;
+ bool Error = false;
+ for (Arg *A : Args) {
+ if (!A->getOption().matches(options::OPT_cuda_gpu_arch_EQ))
+ continue;
+ A->claim();
+
+ const auto &ArchStr = A->getValue();
+ CudaArch Arch = StringToCudaArch(ArchStr);
+ if (Arch == CudaArch::UNKNOWN) {
+ C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr;
+ Error = true;
+ } else if (GpuArchs.insert(Arch).second)
+ GpuArchList.push_back(Arch);
+ }
+
+ // Default to sm_20 which is the lowest common denominator for supported
+ // GPUs.
+ // sm_20 code should work correctly, if suboptimally, on all newer GPUs.
+ if (GpuArchList.empty())
+ GpuArchList.push_back(CudaArch::SM_20);
+
+ return Error;
}
- // Kill host action in case of device-only compilation.
- if (CompileDeviceOnly)
+ };
+
+ /// Add the implementation for other specialized builders here.
+
+ /// Specialized builders being used by this offloading action builder.
+ SmallVector<DeviceActionBuilder *, 4> SpecializedBuilders;
+
+public:
+ OffloadingActionBuilder(Compilation &C, DerivedArgList &Args,
+ const Driver::InputList &Inputs)
+ : C(C), Args(Args) {
+ // Create a specialized builder for each device toolchain.
+
+ IsValid = true;
+
+ // Create a specialized builder for CUDA.
+ SpecializedBuilders.push_back(new CudaActionBuilder(C, Args, Inputs));
+
+ //
+ // TODO: Build other specialized builders here.
+ //
+
+ // Initialize all the builders, keeping track of errors.
+ for (auto *SB : SpecializedBuilders)
+ IsValid = IsValid && !SB->initialize();
+ }
+
+ ~OffloadingActionBuilder() {
+ for (auto *SB : SpecializedBuilders)
+ delete SB;
+ }
+
+ /// Generate an action that adds device dependences (if any) to a host action.
+ /// If no device dependence actions exist, just return the host action \a
+ /// HostAction. If an error is found or if no builder requires the host action
+ /// to be generated, return nullptr.
+ Action *
+ addDeviceDependencesToHostAction(Action *HostAction, const Arg *InputArg,
+ phases::ID CurPhase, phases::ID FinalPhase,
+ DeviceActionBuilder::PhasesTy &Phases) {
+ if (!IsValid)
return nullptr;
- return HostAction;
+
+ if (SpecializedBuilders.empty())
+ return HostAction;
+
+ assert(HostAction && "Invalid host action!");
+
+ OffloadAction::DeviceDependences DDeps;
+ // Check if all the programming models agree we should not emit the host
+ // action. Also, keep track of the offloading kinds employed.
+ auto &OffloadKind = InputArgToOffloadKindMap[InputArg];
+ unsigned InactiveBuilders = 0u;
+ unsigned IgnoringBuilders = 0u;
+ for (auto *SB : SpecializedBuilders) {
+ if (!SB->isValid()) {
+ ++InactiveBuilders;
+ continue;
+ }
+
+ auto RetCode = SB->getDeviceDepences(DDeps, CurPhase, FinalPhase, Phases);
+
+ // If the builder explicitly says the host action should be ignored,
+ // we need to increment the variable that tracks the builders that request
+ // the host object to be ignored.
+ if (RetCode == DeviceActionBuilder::ABRT_Ignore_Host)
+ ++IgnoringBuilders;
+
+ // Unless the builder was inactive for this action, we have to record the
+ // offload kind because the host will have to use it.
+ if (RetCode != DeviceActionBuilder::ABRT_Inactive)
+ OffloadKind |= SB->getAssociatedOffloadKind();
+ }
+
+ // If all builders agree that the host object should be ignored, just return
+ // nullptr.
+ if (IgnoringBuilders &&
+ SpecializedBuilders.size() == (InactiveBuilders + IgnoringBuilders))
+ return nullptr;
+
+ if (DDeps.getActions().empty())
+ return HostAction;
+
+ // We have dependences we need to bundle together. We use an offload action
+ // for that.
+ OffloadAction::HostDependence HDep(
+ *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
+ /*BoundArch=*/nullptr, DDeps);
+ return C.MakeAction<OffloadAction>(HDep, DDeps);
+ }
+
+ /// Generate an action that adds a host dependence to a device action. The
+ /// results will be kept in this action builder. Return true if an error was
+ /// found.
+ bool addHostDependenceToDeviceActions(Action *HostAction,
+ const Arg *InputArg) {
+ if (!IsValid)
+ return true;
+
+ assert(HostAction && "Invalid host action!");
+
+ // Register the offload kinds that are used.
+ auto &OffloadKind = InputArgToOffloadKindMap[InputArg];
+ for (auto *SB : SpecializedBuilders) {
+ if (!SB->isValid())
+ continue;
+
+ auto RetCode = SB->addDeviceDepences(HostAction);
+
+ // Host dependences for device actions are not compatible with that same
+ // action being ignored.
+ assert(RetCode != DeviceActionBuilder::ABRT_Ignore_Host &&
+             "Host dependence not expected to be ignored!");
+
+ // Unless the builder was inactive for this action, we have to record the
+ // offload kind because the host will have to use it.
+ if (RetCode != DeviceActionBuilder::ABRT_Inactive)
+ OffloadKind |= SB->getAssociatedOffloadKind();
+ }
+
+ return false;
}
- // If we're not a partial or device-only compilation, we compile each arch to
- // ptx and assemble to cubin, then feed the cubin *and* the ptx into a device
- // "link" action, which uses fatbinary to combine these cubins into one
- // fatbin. The fatbin is then an input to the host compilation.
- ActionList DeviceActions;
- for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
- Action* AssembleAction = CudaDeviceActions[I];
- assert(AssembleAction->getType() == types::TY_Object);
- assert(AssembleAction->getInputs().size() == 1);
-
- Action* BackendAction = AssembleAction->getInputs()[0];
- assert(BackendAction->getType() == types::TY_PP_Asm);
-
- for (auto &A : {AssembleAction, BackendAction}) {
- OffloadAction::DeviceDependences DDep;
- DDep.add(*A, *CudaTC, CudaArchToString(GpuArchList[I]), Action::OFK_Cuda);
- DeviceActions.push_back(C.MakeAction<OffloadAction>(DDep, A->getType()));
- }
- }
- auto FatbinAction =
- C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);
-
- // Return a new host action that incorporates original host action and all
- // device actions.
- OffloadAction::HostDependence HDep(*HostAction, *HostTC,
- /*BoundArch=*/nullptr, Action::OFK_Cuda);
- OffloadAction::DeviceDependences DDep;
- DDep.add(*FatbinAction, *CudaTC, /*BoundArch=*/nullptr, Action::OFK_Cuda);
- return C.MakeAction<OffloadAction>(HDep, DDep);
-}
+ /// Add the offloading top level actions to the provided action list.
+ bool appendTopLevelActions(ActionList &AL, Action *HostAction,
+ const Arg *InputArg) {
+ auto NumActions = AL.size();
+
+ for (auto *SB : SpecializedBuilders) {
+ if (!SB->isValid())
+ continue;
+ SB->appendTopLevelActions(AL);
+ }
+
+ assert(NumActions <= AL.size() && "Expecting more actions, not less!");
+
+ // Propagate to the current host action (if any) the offload information
+ // associated with the current input.
+ if (HostAction)
+ HostAction->propagateHostOffloadInfo(InputArgToOffloadKindMap[InputArg],
+ /*BoundArch=*/nullptr);
+
+ // If any action is added by the builders, -o is ambiguous if we have more
+ // than one top-level action.
+ if (NumActions < AL.size() && Args.hasArg(options::OPT_o) &&
+ AL.size() > 1) {
+ C.getDriver().Diag(
+ clang::diag::err_drv_output_argument_with_multiple_files);
+ return true;
+ }
+
+ return false;
+ }
+
+  /// Process the host linker action. This currently consists of replacing it
+  /// with an offload action if there are device link objects, and of
+  /// propagating to the host action all the offload kinds used in the current
+  /// compilation. The resulting action is returned.
+ Action *processHostLinkAction(Action *HostAction) {
+ // Add all the dependences from the device linking actions.
+ OffloadAction::DeviceDependences DDeps;
+ for (auto *SB : SpecializedBuilders) {
+ if (!SB->isValid())
+ continue;
+
+ SB->appendLinkDependences(DDeps);
+ }
+
+ // Calculate all the offload kinds used in the current compilation.
+ unsigned ActiveOffloadKinds = 0u;
+ for (auto &I : InputArgToOffloadKindMap)
+ ActiveOffloadKinds |= I.second;
+
+ // If we don't have device dependencies, we don't have to create an offload
+ // action.
+ if (DDeps.getActions().empty()) {
+ // Propagate all the active kinds to host action. Given that it is a link
+ // action it is assumed to depend on all actions generated so far.
+ HostAction->propagateHostOffloadInfo(ActiveOffloadKinds,
+ /*BoundArch=*/nullptr);
+ return HostAction;
+ }
+
+ // Create the offload action with all dependences. When an offload action
+ // is created the kinds are propagated to the host action, so we don't have
+    // to do that explicitly here.
+ OffloadAction::HostDependence HDep(
+ *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
+ /*BoundArch*/ nullptr, ActiveOffloadKinds);
+ return C.MakeAction<OffloadAction>(HDep, DDeps);
+ }
+};
+} // anonymous namespace.
void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
const InputList &Inputs, ActionList &Actions) const {
@@ -1640,8 +2037,8 @@ void Driver::BuildActions(Compilation &C
YcArg = YuArg = nullptr;
}
- // Track the host offload kinds used on this compilation.
- unsigned CompilationActiveOffloadHostKinds = 0u;
+ // Builder to be used to build offloading actions.
+ OffloadingActionBuilder OffloadBuilder(C, Args, Inputs);
// Construct the actions to perform.
ActionList LinkerInputs;
@@ -1707,17 +2104,14 @@ void Driver::BuildActions(Compilation &C
}
}
- phases::ID CudaInjectionPhase =
- (phases::Compile < FinalPhase &&
- llvm::find(PL, phases::Compile) != PL.end())
- ? phases::Compile
- : FinalPhase;
-
- // Track the host offload kinds used on this input.
- unsigned InputActiveOffloadHostKinds = 0u;
-
// Build the pipeline for this file.
Action *Current = C.MakeAction<InputAction>(*InputArg, InputType);
+
+ // Use the current host action in any of the offloading actions, if
+ // required.
+ if (OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg))
+ break;
+
for (SmallVectorImpl<phases::ID>::iterator i = PL.begin(), e = PL.end();
i != e; ++i) {
phases::ID Phase = *i;
@@ -1726,6 +2120,12 @@ void Driver::BuildActions(Compilation &C
if (Phase > FinalPhase)
break;
+ // Add any offload action the host action depends on.
+ Current = OffloadBuilder.addDeviceDependencesToHostAction(
+ Current, InputArg, Phase, FinalPhase, PL);
+ if (!Current)
+ break;
+
// Queue linker inputs.
if (Phase == phases::Link) {
assert((i + 1) == e && "linking must be final compilation step.");
@@ -1734,48 +2134,37 @@ void Driver::BuildActions(Compilation &C
break;
}
- // Some types skip the assembler phase (e.g., llvm-bc), but we can't
- // encode this in the steps because the intermediate type depends on
- // arguments. Just special case here.
- if (Phase == phases::Assemble && Current->getType() != types::TY_PP_Asm)
+ // Otherwise construct the appropriate action.
+ auto *NewCurrent = ConstructPhaseAction(C, Args, Phase, Current);
+
+ // We didn't create a new action, so we will just move to the next phase.
+ if (NewCurrent == Current)
continue;
- // Otherwise construct the appropriate action.
- Current = ConstructPhaseAction(C, Args, Phase, Current);
+ Current = NewCurrent;
- if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase) {
- Current = buildCudaActions(C, Args, InputArg, Current, Actions);
- if (!Current)
- break;
-
- // We produced a CUDA action for this input, so the host has to support
- // CUDA.
- InputActiveOffloadHostKinds |= Action::OFK_Cuda;
- CompilationActiveOffloadHostKinds |= Action::OFK_Cuda;
- }
+ // Use the current host action in any of the offloading actions, if
+ // required.
+ if (OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg))
+ break;
if (Current->getType() == types::TY_Nothing)
break;
}
- // If we ended with something, add to the output list. Also, propagate the
- // offload information to the top-level host action related with the current
- // input.
- if (Current) {
- if (InputActiveOffloadHostKinds)
- Current->propagateHostOffloadInfo(InputActiveOffloadHostKinds,
- /*BoundArch=*/nullptr);
+ // If we ended with something, add to the output list.
+ if (Current)
Actions.push_back(Current);
- }
+
+ // Add any top level actions generated for offloading.
+ OffloadBuilder.appendTopLevelActions(Actions, Current, InputArg);
}
- // Add a link action if necessary and propagate the offload information for
- // the current compilation.
+ // Add a link action if necessary.
if (!LinkerInputs.empty()) {
- Actions.push_back(
- C.MakeAction<LinkJobAction>(LinkerInputs, types::TY_Image));
- Actions.back()->propagateHostOffloadInfo(CompilationActiveOffloadHostKinds,
- /*BoundArch=*/nullptr);
+ Action *LA = C.MakeAction<LinkJobAction>(LinkerInputs, types::TY_Image);
+ LA = OffloadBuilder.processHostLinkAction(LA);
+ Actions.push_back(LA);
}
// If we are linking, claim any options which are obviously only used for
@@ -1797,6 +2186,13 @@ void Driver::BuildActions(Compilation &C
Action *Driver::ConstructPhaseAction(Compilation &C, const ArgList &Args,
phases::ID Phase, Action *Input) const {
llvm::PrettyStackTraceString CrashInfo("Constructing phase actions");
+
+ // Some types skip the assembler phase (e.g., llvm-bc), but we can't
+ // encode this in the steps because the intermediate type depends on
+ // arguments. Just special case here.
+ if (Phase == phases::Assemble && Input->getType() != types::TY_PP_Asm)
+ return Input;
+
// Build the appropriate action.
switch (Phase) {
case phases::Link:
Modified: cfe/trunk/lib/Driver/Types.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/Types.cpp?rev=282865&r1=282864&r2=282865&view=diff
==============================================================================
--- cfe/trunk/lib/Driver/Types.cpp (original)
+++ cfe/trunk/lib/Driver/Types.cpp Fri Sep 30 10:34:19 2016
@@ -254,7 +254,7 @@ void types::getCompilationPhases(ID Id,
}
}
- if (!onlyPrecompileType(Id) && Id != TY_CUDA_DEVICE) {
+ if (!onlyPrecompileType(Id)) {
P.push_back(phases::Link);
}
assert(0 < P.size() && "Not enough phases in list");
Modified: cfe/trunk/test/Driver/cuda-bindings.cu
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Driver/cuda-bindings.cu?rev=282865&r1=282864&r2=282865&view=diff
==============================================================================
--- cfe/trunk/test/Driver/cuda-bindings.cu (original)
+++ cfe/trunk/test/Driver/cuda-bindings.cu Fri Sep 30 10:34:19 2016
@@ -34,8 +34,8 @@
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings --cuda-gpu-arch=sm_30 %s -S 2>&1 \
// RUN: | FileCheck -check-prefix=ASM %s
-// ASM: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-device-cuda-nvptx64-nvidia-cuda-sm_30.s"
-// ASM: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: "cuda-bindings.s"
+// ASM-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-device-cuda-nvptx64-nvidia-cuda-sm_30.s"
+// ASM-DAG: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: "cuda-bindings.s"
//
// Test two gpu architectures with complete compilation.
@@ -62,9 +62,9 @@
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings \
// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
// RUN: | FileCheck -check-prefix=ASM2 %s
-// ASM2: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-device-cuda-nvptx64-nvidia-cuda-sm_30.s"
-// ASM2: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-device-cuda-nvptx64-nvidia-cuda-sm_35.s"
-// ASM2: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: "cuda-bindings.s"
+// ASM2-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-device-cuda-nvptx64-nvidia-cuda-sm_30.s"
+// ASM2-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-device-cuda-nvptx64-nvidia-cuda-sm_35.s"
+// ASM2-DAG: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: "cuda-bindings.s"
//
// Test one or more gpu architecture with complete compilation in host-only
Modified: cfe/trunk/test/Driver/cuda-phases.cu
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Driver/cuda-phases.cu?rev=282865&r1=282864&r2=282865&view=diff
==============================================================================
--- cfe/trunk/test/Driver/cuda-phases.cu (original)
+++ cfe/trunk/test/Driver/cuda-phases.cu Fri Sep 30 10:34:19 2016
@@ -13,84 +13,84 @@
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \
// RUN: | FileCheck -check-prefix=BIN %s
-// BIN: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// BIN: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
-// BIN: 2: compiler, {1}, ir, (host-cuda)
-// BIN: 3: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// BIN: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_30)
-// BIN: 5: compiler, {4}, ir, (device-cuda, sm_30)
-// BIN: 6: backend, {5}, assembler, (device-cuda, sm_30)
-// BIN: 7: assembler, {6}, object, (device-cuda, sm_30)
-// BIN: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {7}, object
-// BIN: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {6}, assembler
-// BIN: 10: linker, {8, 9}, cuda-fatbin, (device-cuda)
-// BIN: 11: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {10}, ir
-// BIN: 12: backend, {11}, assembler, (host-cuda)
-// BIN: 13: assembler, {12}, object, (host-cuda)
-// BIN: 14: linker, {13}, image, (host-cuda)
+// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
+// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
+// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30)
+// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30)
+// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30)
+// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30)
+// BIN-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object
+// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler
+// BIN-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-cuda)
+// BIN-DAG: [[P11:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P10]]}, ir
+// BIN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-cuda)
+// BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-cuda)
+// BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-cuda)
//
// Test single gpu architecture up to the assemble phase.
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \
// RUN: | FileCheck -check-prefix=ASM %s
-// ASM: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// ASM: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
-// ASM: 2: compiler, {1}, ir, (device-cuda, sm_30)
-// ASM: 3: backend, {2}, assembler, (device-cuda, sm_30)
-// ASM: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
-// ASM: 5: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// ASM: 6: preprocessor, {5}, cuda-cpp-output, (host-cuda)
-// ASM: 7: compiler, {6}, ir, (host-cuda)
-// ASM: 8: backend, {7}, assembler, (host-cuda)
+// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
+// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
+// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
+// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
+// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (host-cuda)
+// ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-cuda)
+// ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-cuda)
//
// Test two gpu architectures with complete compilation.
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
// RUN: | FileCheck -check-prefix=BIN2 %s
-// BIN2: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// BIN2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
-// BIN2: 2: compiler, {1}, ir, (host-cuda)
-// BIN2: 3: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// BIN2: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_30)
-// BIN2: 5: compiler, {4}, ir, (device-cuda, sm_30)
-// BIN2: 6: backend, {5}, assembler, (device-cuda, sm_30)
-// BIN2: 7: assembler, {6}, object, (device-cuda, sm_30)
-// BIN2: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {7}, object
-// BIN2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {6}, assembler
-// BIN2: 10: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// BIN2: 11: preprocessor, {10}, cuda-cpp-output, (device-cuda, sm_35)
-// BIN2: 12: compiler, {11}, ir, (device-cuda, sm_35)
-// BIN2: 13: backend, {12}, assembler, (device-cuda, sm_35)
-// BIN2: 14: assembler, {13}, object, (device-cuda, sm_35)
-// BIN2: 15: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {14}, object
-// BIN2: 16: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {13}, assembler
-// BIN2: 17: linker, {8, 9, 15, 16}, cuda-fatbin, (device-cuda)
-// BIN2: 18: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {17}, ir
-// BIN2: 19: backend, {18}, assembler, (host-cuda)
-// BIN2: 20: assembler, {19}, object, (host-cuda)
-// BIN2: 21: linker, {20}, image, (host-cuda)
+// BIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
+// BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
+// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30)
+// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30)
+// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30)
+// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30)
+// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object
+// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler
+// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
+// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (device-cuda, sm_35)
+// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, sm_35)
+// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-cuda, sm_35)
+// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, sm_35)
+// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P14]]}, object
+// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P13]]}, assembler
+// BIN2-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-cuda)
+// BIN2-DAG: [[P18:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P17]]}, ir
+// BIN2-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-cuda)
+// BIN2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-cuda)
+// BIN2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-cuda)
//
// Test two gpu architecturess up to the assemble phase.
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
// RUN: | FileCheck -check-prefix=ASM2 %s
-// ASM2: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// ASM2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
-// ASM2: 2: compiler, {1}, ir, (device-cuda, sm_30)
-// ASM2: 3: backend, {2}, assembler, (device-cuda, sm_30)
-// ASM2: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
-// ASM2: 5: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// ASM2: 6: preprocessor, {5}, cuda-cpp-output, (device-cuda, sm_35)
-// ASM2: 7: compiler, {6}, ir, (device-cuda, sm_35)
-// ASM2: 8: backend, {7}, assembler, (device-cuda, sm_35)
-// ASM2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {8}, assembler
-// ASM2: 10: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// ASM2: 11: preprocessor, {10}, cuda-cpp-output, (host-cuda)
-// ASM2: 12: compiler, {11}, ir, (host-cuda)
-// ASM2: 13: backend, {12}, assembler, (host-cuda)
+// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
+// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
+// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
+// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
+// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
+// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35)
+// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35)
+// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35)
+// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler
+// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (host-cuda)
+// ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-cuda)
+// ASM2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-cuda)
//
// Test single gpu architecture with complete compilation in host-only
@@ -98,25 +98,22 @@
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \
// RUN: | FileCheck -check-prefix=HBIN %s
-// HBIN: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HBIN: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
-// HBIN: 2: compiler, {1}, ir, (host-cuda)
-// HBIN: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
-// HBIN: 4: backend, {3}, assembler, (host-cuda)
-// HBIN: 5: assembler, {4}, object, (host-cuda)
-// HBIN: 6: linker, {5}, image, (host-cuda)
-
+// HBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
+// HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
+// HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
+// HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda)
+// HBIN-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-cuda)
//
// Test single gpu architecture up to the assemble phase in host-only
// compilation mode.
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \
// RUN: | FileCheck -check-prefix=HASM %s
-// HASM: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HASM: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
-// HASM: 2: compiler, {1}, ir, (host-cuda)
-// HASM: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
-// HASM: 4: backend, {3}, assembler, (host-cuda)
+// HASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
+// HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
+// HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
//
// Test two gpu architectures with complete compilation in host-only
@@ -124,13 +121,12 @@
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \
// RUN: | FileCheck -check-prefix=HBIN2 %s
-// HBIN2: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HBIN2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
-// HBIN2: 2: compiler, {1}, ir, (host-cuda)
-// HBIN2: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
-// HBIN2: 4: backend, {3}, assembler, (host-cuda)
-// HBIN2: 5: assembler, {4}, object, (host-cuda)
-// HBIN2: 6: linker, {5}, image, (host-cuda)
+// HBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
+// HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
+// HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
+// HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda)
+// HBIN2-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-cuda)
//
// Test two gpu architectures up to the assemble phase in host-only
@@ -138,11 +134,10 @@
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \
// RUN: | FileCheck -check-prefix=HASM2 %s
-// HASM2: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HASM2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
-// HASM2: 2: compiler, {1}, ir, (host-cuda)
-// HASM2: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
-// HASM2: 4: backend, {3}, assembler, (host-cuda)
+// HASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
+// HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
+// HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
//
// Test single gpu architecture with complete compilation in device-only
@@ -150,12 +145,12 @@
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \
// RUN: | FileCheck -check-prefix=DBIN %s
-// DBIN: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DBIN: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
-// DBIN: 2: compiler, {1}, ir, (device-cuda, sm_30)
-// DBIN: 3: backend, {2}, assembler, (device-cuda, sm_30)
-// DBIN: 4: assembler, {3}, object, (device-cuda, sm_30)
-// DBIN: 5: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {4}, object
+// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
+// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
+// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
+// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30)
+// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object
//
// Test single gpu architecture up to the assemble phase in device-only
@@ -163,11 +158,11 @@
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \
// RUN: | FileCheck -check-prefix=DASM %s
-// DASM: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DASM: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
-// DASM: 2: compiler, {1}, ir, (device-cuda, sm_30)
-// DASM: 3: backend, {2}, assembler, (device-cuda, sm_30)
-// DASM: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
+// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
+// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
+// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
+// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
//
// Test two gpu architectures with complete compilation in device-only
@@ -175,18 +170,18 @@
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \
// RUN: | FileCheck -check-prefix=DBIN2 %s
-// DBIN2: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DBIN2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
-// DBIN2: 2: compiler, {1}, ir, (device-cuda, sm_30)
-// DBIN2: 3: backend, {2}, assembler, (device-cuda, sm_30)
-// DBIN2: 4: assembler, {3}, object, (device-cuda, sm_30)
-// DBIN2: 5: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {4}, object
-// DBIN2: 6: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// DBIN2: 7: preprocessor, {6}, cuda-cpp-output, (device-cuda, sm_35)
-// DBIN2: 8: compiler, {7}, ir, (device-cuda, sm_35)
-// DBIN2: 9: backend, {8}, assembler, (device-cuda, sm_35)
-// DBIN2: 10: assembler, {9}, object, (device-cuda, sm_35)
-// DBIN2: 11: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {10}, object
+// DBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
+// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
+// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
+// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30)
+// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object
+// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
+// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, cuda-cpp-output, (device-cuda, sm_35)
+// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, sm_35)
+// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-cuda, sm_35)
+// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, sm_35)
+// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P10]]}, object
//
// Test two gpu architectures up to the assemble phase in device-only
@@ -194,13 +189,13 @@
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \
// RUN: | FileCheck -check-prefix=DASM2 %s
-// DASM2: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DASM2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
-// DASM2: 2: compiler, {1}, ir, (device-cuda, sm_30)
-// DASM2: 3: backend, {2}, assembler, (device-cuda, sm_30)
-// DASM2: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
-// DASM2: 5: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// DASM2: 6: preprocessor, {5}, cuda-cpp-output, (device-cuda, sm_35)
-// DASM2: 7: compiler, {6}, ir, (device-cuda, sm_35)
-// DASM2: 8: backend, {7}, assembler, (device-cuda, sm_35)
-// DASM2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {8}, assembler
+// DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
+// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
+// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
+// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
+// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
+// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35)
+// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35)
+// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35)
+// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler