r282865 - [CUDA][OpenMP] Add a generic offload action builder
Samuel Antao via cfe-commits
cfe-commits at lists.llvm.org
Fri Sep 30 08:34:19 PDT 2016
Author: sfantao
Date: Fri Sep 30 10:34:19 2016
New Revision: 282865
URL: http://llvm.org/viewvc/llvm-project?rev=282865&view=rev
Log:
[CUDA][OpenMP] Add a generic offload action builder
Summary:
This patch proposes a new class to generate and record action dependences related to offloading. The builder provides three main functionalities:
- Add device dependences to host actions.
- Add host dependence to device actions.
- Register device top-level actions.
The constructor of the builder detects the programming models that should be supported and generates a specialized builder for each. If a new programming model is added in the future, only a new specialized builder has to be implemented (a skeleton sketch follows below).
When a specialized builder is created, it produces programming-model-specific diagnostics.
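As a rough illustration only (not part of this patch; `FooActionBuilder` and its offload kind are placeholders), a builder for a new programming model would subclass `DeviceActionBuilder`, override the hooks it needs, and be registered next to the CUDA builder:

```cpp
// Hypothetical sketch: "Foo" stands for a future programming model.
class FooActionBuilder final : public DeviceActionBuilder {
public:
  FooActionBuilder(Compilation &C, DerivedArgList &Args,
                   const Driver::InputList &Inputs)
      : DeviceActionBuilder(C, Args, Inputs,
                            /*placeholder offload kind*/ Action::OFK_None) {}

  bool initialize() override {
    // Detect whether this model is requested, emit model-specific
    // diagnostics, and populate ToolChains; returning true reports an error.
    return false;
  }

  ActionBuilderReturnCode
  getDeviceDepences(OffloadAction::DeviceDependences &DA, phases::ID CurPhase,
                    phases::ID FinalPhase, PhasesTy &Phases) override {
    // Create device actions for the current phase and record them in DA so
    // the generic builder can attach them to the host action.
    return ABRT_Inactive;
  }
};

// Registration in the OffloadingActionBuilder constructor:
//   SpecializedBuilders.push_back(new FooActionBuilder(C, Args, Inputs));
```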
A CUDA specialized builder is proposed in the patch; it mostly consists of partitioning the existing `buildCudaActions` logic along the three functionalities above. A condensed sketch of how the driver uses the builder is also included below.
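For reference, here is a condensed, error-handling-free sketch of how `Driver::BuildActions` drives the builder after this patch, assembled from the Driver.cpp hunks further down (bookkeeping such as adding `Current` to the output list is elided):

```cpp
// Condensed sketch of the new BuildActions flow; see the actual diff below.
OffloadingActionBuilder OffloadBuilder(C, Args, Inputs);

for (auto &I : Inputs) {
  Action *Current = C.MakeAction<InputAction>(*InputArg, InputType);

  // Let the specialized builders see the new host input so they can create
  // their device-side input actions.
  OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg);

  for (phases::ID Phase : PL) {
    // Attach device dependences (if any) to the host action for this phase.
    // A nullptr result means the host action should not be emitted, e.g. in
    // device-only compilations.
    Current = OffloadBuilder.addDeviceDependencesToHostAction(
        Current, InputArg, Phase, FinalPhase, PL);
    if (!Current)
      break;

    Current = ConstructPhaseAction(C, Args, Phase, Current);
    OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg);
  }

  // Device-only or partial results become top-level actions of their own.
  OffloadBuilder.appendTopLevelActions(Actions, Current, InputArg);
}

// The host link action, if any, is wrapped with the device link dependences
// and the offload kinds used in this compilation.
if (!LinkerInputs.empty()) {
  Action *LA = C.MakeAction<LinkJobAction>(LinkerInputs, types::TY_Image);
  Actions.push_back(OffloadBuilder.processHostLinkAction(LA));
}
```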
Reviewers: tra, echristo, ABataev, jlebar, hfinkel
Subscribers: Hahnfeld, whchung, guansong, jlebar, mehdi_amini, andreybokhanko, tcramer, mkuron, cfe-commits, arpith-jacob, carlo.bertolli, caomhin
Differential Revision: https://reviews.llvm.org/D18172
Modified:
cfe/trunk/include/clang/Driver/Compilation.h
cfe/trunk/lib/Driver/Driver.cpp
cfe/trunk/lib/Driver/Types.cpp
cfe/trunk/test/Driver/cuda-bindings.cu
cfe/trunk/test/Driver/cuda-phases.cu
Modified: cfe/trunk/include/clang/Driver/Compilation.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Driver/Compilation.h?rev=282865&r1=282864&r2=282865&view=diff
==============================================================================
--- cfe/trunk/include/clang/Driver/Compilation.h (original)
+++ cfe/trunk/include/clang/Driver/Compilation.h Fri Sep 30 10:34:19 2016
@@ -115,6 +115,12 @@ public:
return OrderedOffloadingToolchains.equal_range(Kind);
}
+ /// Return true if an offloading tool chain of a given kind exists.
+ template <Action::OffloadKind Kind> bool hasOffloadToolChain() const {
+ return OrderedOffloadingToolchains.find(Kind) !=
+ OrderedOffloadingToolchains.end();
+ }
+
/// Return an offload toolchain of the provided kind. Only one is expected to
/// exist.
template <Action::OffloadKind Kind>
Modified: cfe/trunk/lib/Driver/Driver.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/Driver.cpp?rev=282865&r1=282864&r2=282865&view=diff
==============================================================================
--- cfe/trunk/lib/Driver/Driver.cpp (original)
+++ cfe/trunk/lib/Driver/Driver.cpp Fri Sep 30 10:34:19 2016
@@ -1400,139 +1400,536 @@ void Driver::BuildInputs(const ToolChain
}
}
-// For each unique --cuda-gpu-arch= argument creates a TY_CUDA_DEVICE
-// input action and then wraps each in CudaDeviceAction paired with
-// appropriate GPU arch name. In case of partial (i.e preprocessing
-// only) or device-only compilation, each device action is added to /p
-// Actions and /p Current is released. Otherwise the function creates
-// and returns a new CudaHostAction which wraps /p Current and device
-// side actions.
-static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
- const Arg *InputArg, Action *HostAction,
- ActionList &Actions) {
- Arg *PartialCompilationArg = Args.getLastArg(
- options::OPT_cuda_host_only, options::OPT_cuda_device_only,
- options::OPT_cuda_compile_host_device);
- bool CompileHostOnly =
- PartialCompilationArg &&
- PartialCompilationArg->getOption().matches(options::OPT_cuda_host_only);
- bool CompileDeviceOnly =
- PartialCompilationArg &&
- PartialCompilationArg->getOption().matches(options::OPT_cuda_device_only);
- const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
- assert(HostTC && "No toolchain for host compilation.");
- if (HostTC->getTriple().isNVPTX()) {
- // We do not support targeting NVPTX for host compilation. Throw
- // an error and abort pipeline construction early so we don't trip
- // asserts that assume device-side compilation.
- C.getDriver().Diag(diag::err_drv_cuda_nvptx_host);
- return nullptr;
- }
-
- if (CompileHostOnly) {
- OffloadAction::HostDependence HDep(*HostAction, *HostTC,
- /*BoundArch=*/nullptr, Action::OFK_Cuda);
- return C.MakeAction<OffloadAction>(HDep);
- }
-
- // Collect all cuda_gpu_arch parameters, removing duplicates.
- SmallVector<CudaArch, 4> GpuArchList;
- llvm::SmallSet<CudaArch, 4> GpuArchs;
- for (Arg *A : Args) {
- if (!A->getOption().matches(options::OPT_cuda_gpu_arch_EQ))
- continue;
- A->claim();
+namespace {
+/// Provides a convenient interface for different programming models to generate
+/// the required device actions.
+class OffloadingActionBuilder final {
+ /// Flag used to trace errors in the builder.
+ bool IsValid = false;
+
+ /// The compilation that is using this builder.
+ Compilation &C;
+
+ /// The derived arguments associated with this builder.
+ DerivedArgList &Args;
+
+ /// Map between an input argument and the offload kinds used to process it.
+ std::map<const Arg *, unsigned> InputArgToOffloadKindMap;
+
+ /// Builder interface. It doesn't build anything or keep any state.
+ class DeviceActionBuilder {
+ public:
+ typedef llvm::SmallVector<phases::ID, phases::MaxNumberOfPhases> PhasesTy;
+
+ enum ActionBuilderReturnCode {
+ // The builder acted successfully on the current action.
+ ABRT_Success,
+ // The builder didn't have to act on the current action.
+ ABRT_Inactive,
+ // The builder was successful and requested the host action to not be
+ // generated.
+ ABRT_Ignore_Host,
+ };
+
+ protected:
+ /// Compilation associated with this builder.
+ Compilation &C;
+
+ /// Tool chains associated with this builder. The same programming
+ /// model may have associated one or more tool chains.
+ SmallVector<const ToolChain *, 2> ToolChains;
+
+ /// The derived arguments associated with this builder.
+ DerivedArgList &Args;
+
+ /// The inputs associated with this builder.
+ const Driver::InputList &Inputs;
+
+ /// The associated offload kind.
+ Action::OffloadKind AssociatedOffloadKind = Action::OFK_None;
+
+ public:
+ DeviceActionBuilder(Compilation &C, DerivedArgList &Args,
+ const Driver::InputList &Inputs,
+ Action::OffloadKind AssociatedOffloadKind)
+ : C(C), Args(Args), Inputs(Inputs),
+ AssociatedOffloadKind(AssociatedOffloadKind) {}
+ virtual ~DeviceActionBuilder() {}
+
+ /// Fill up the array \a DA with all the device dependences that should be
+ /// added to the provided host action \a HostAction. By default it is
+ /// inactive.
+ virtual ActionBuilderReturnCode
+ getDeviceDepences(OffloadAction::DeviceDependences &DA, phases::ID CurPhase,
+ phases::ID FinalPhase, PhasesTy &Phases) {
+ return ABRT_Inactive;
+ }
+
+ /// Update the state to include the provided host action \a HostAction as a
+ /// dependency of the current device action. By default it is inactive.
+ virtual ActionBuilderReturnCode addDeviceDepences(Action *HostAction) {
+ return ABRT_Inactive;
+ }
+
+    /// Append top level actions generated by the builder.
+ virtual void appendTopLevelActions(ActionList &AL) {}
+
+    /// Append linker actions generated by the builder.
+ virtual void appendLinkDependences(OffloadAction::DeviceDependences &DA) {}
+
+ /// Initialize the builder. Return true if any initialization errors are
+ /// found.
+ virtual bool initialize() { return false; }
+
+ /// Return true if this builder is valid. We have a valid builder if we have
+ /// associated device tool chains.
+ bool isValid() { return !ToolChains.empty(); }
+
+ /// Return the associated offload kind.
+ Action::OffloadKind getAssociatedOffloadKind() {
+ return AssociatedOffloadKind;
+ }
+ };
+
+ /// \brief CUDA action builder. It injects device code in the host backend
+ /// action.
+ class CudaActionBuilder final : public DeviceActionBuilder {
+ /// Flags to signal if the user requested host-only or device-only
+ /// compilation.
+ bool CompileHostOnly = false;
+ bool CompileDeviceOnly = false;
+
+ /// List of GPU architectures to use in this compilation.
+ SmallVector<CudaArch, 4> GpuArchList;
+
+ /// The CUDA actions for the current input.
+ ActionList CudaDeviceActions;
+
+ /// The CUDA fat binary if it was generated for the current input.
+ Action *CudaFatBinary = nullptr;
+
+ /// Flag that is set to true if this builder acted on the current input.
+ bool IsActive = false;
+
+ public:
+ CudaActionBuilder(Compilation &C, DerivedArgList &Args,
+ const Driver::InputList &Inputs)
+ : DeviceActionBuilder(C, Args, Inputs, Action::OFK_Cuda) {}
+
+ ActionBuilderReturnCode
+ getDeviceDepences(OffloadAction::DeviceDependences &DA, phases::ID CurPhase,
+ phases::ID FinalPhase, PhasesTy &Phases) override {
+ if (!IsActive)
+ return ABRT_Inactive;
+
+ // If we don't have more CUDA actions, we don't have any dependences to
+ // create for the host.
+ if (CudaDeviceActions.empty())
+ return ABRT_Success;
+
+ assert(CudaDeviceActions.size() == GpuArchList.size() &&
+ "Expecting one action per GPU architecture.");
+ assert(!CompileHostOnly &&
+ "Not expecting CUDA actions in host-only compilation.");
+
+ // If we are generating code for the device or we are in a backend phase,
+ // we attempt to generate the fat binary. We compile each arch to ptx and
+ // assemble to cubin, then feed the cubin *and* the ptx into a device
+ // "link" action, which uses fatbinary to combine these cubins into one
+ // fatbin. The fatbin is then an input to the host action if not in
+ // device-only mode.
+ if (CompileDeviceOnly || CurPhase == phases::Backend) {
+ ActionList DeviceActions;
+ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+ // Produce the device action from the current phase up to the assemble
+ // phase.
+ for (auto Ph : Phases) {
+ // Skip the phases that were already dealt with.
+ if (Ph < CurPhase)
+ continue;
+ // We have to be consistent with the host final phase.
+ if (Ph > FinalPhase)
+ break;
+
+ CudaDeviceActions[I] = C.getDriver().ConstructPhaseAction(
+ C, Args, Ph, CudaDeviceActions[I]);
+
+ if (Ph == phases::Assemble)
+ break;
+ }
+
+ // If we didn't reach the assemble phase, we can't generate the fat
+ // binary. We don't need to generate the fat binary if we are not in
+ // device-only mode.
+ if (!isa<AssembleJobAction>(CudaDeviceActions[I]) ||
+ CompileDeviceOnly)
+ continue;
+
+ Action *AssembleAction = CudaDeviceActions[I];
+ assert(AssembleAction->getType() == types::TY_Object);
+ assert(AssembleAction->getInputs().size() == 1);
+
+ Action *BackendAction = AssembleAction->getInputs()[0];
+ assert(BackendAction->getType() == types::TY_PP_Asm);
+
+ for (auto &A : {AssembleAction, BackendAction}) {
+ OffloadAction::DeviceDependences DDep;
+ DDep.add(*A, *ToolChains.front(), CudaArchToString(GpuArchList[I]),
+ Action::OFK_Cuda);
+ DeviceActions.push_back(
+ C.MakeAction<OffloadAction>(DDep, A->getType()));
+ }
+ }
+
+ // We generate the fat binary if we have device input actions.
+ if (!DeviceActions.empty()) {
+ CudaFatBinary =
+ C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);
+
+ if (!CompileDeviceOnly) {
+ DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
+ Action::OFK_Cuda);
+          // Clear the fat binary; it is already a dependence of a host
+          // action.
+ CudaFatBinary = nullptr;
+ }
+
+        // Remove the CUDA actions as they are already connected to a host
+        // action or fat binary.
+ CudaDeviceActions.clear();
+ }
- const auto &ArchStr = A->getValue();
- CudaArch Arch = StringToCudaArch(ArchStr);
- if (Arch == CudaArch::UNKNOWN)
- C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr;
- else if (GpuArchs.insert(Arch).second)
- GpuArchList.push_back(Arch);
- }
-
- // Default to sm_20 which is the lowest common denominator for supported GPUs.
- // sm_20 code should work correctly, if suboptimally, on all newer GPUs.
- if (GpuArchList.empty())
- GpuArchList.push_back(CudaArch::SM_20);
-
- // Replicate inputs for each GPU architecture.
- Driver::InputList CudaDeviceInputs;
- for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
- CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg));
-
- // Build actions for all device inputs.
- ActionList CudaDeviceActions;
- C.getDriver().BuildActions(C, Args, CudaDeviceInputs, CudaDeviceActions);
- assert(GpuArchList.size() == CudaDeviceActions.size() &&
- "Failed to create actions for all devices");
-
- // Check whether any of device actions stopped before they could generate PTX.
- bool PartialCompilation =
- llvm::any_of(CudaDeviceActions, [](const Action *a) {
- return a->getKind() != Action::AssembleJobClass;
- });
-
- const ToolChain *CudaTC = C.getSingleOffloadToolChain<Action::OFK_Cuda>();
-
- // Figure out what to do with device actions -- pass them as inputs to the
- // host action or run each of them independently.
- if (PartialCompilation || CompileDeviceOnly) {
- // In case of partial or device-only compilation results of device actions
- // are not consumed by the host action device actions have to be added to
- // top-level actions list with AtTopLevel=true and run independently.
-
- // -o is ambiguous if we have more than one top-level action.
- if (Args.hasArg(options::OPT_o) &&
- (!CompileDeviceOnly || GpuArchList.size() > 1)) {
- C.getDriver().Diag(
- clang::diag::err_drv_output_argument_with_multiple_files);
- return nullptr;
+      // We avoid creating a host action in device-only mode.
+ return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success;
+ }
+
+ assert(CurPhase < phases::Backend && "Generating single CUDA "
+ "instructions should only occur "
+ "before the backend phase!");
+
+ // By default, we produce an action for each device arch.
+ for (Action *&A : CudaDeviceActions)
+ A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A);
+
+ return ABRT_Success;
+ }
+
+ ActionBuilderReturnCode addDeviceDepences(Action *HostAction) override {
+ // While generating code for CUDA, we only depend on the host input action
+ // to trigger the creation of all the CUDA device actions.
+
+ // If we are dealing with an input action, replicate it for each GPU
+ // architecture. If we are in host-only mode we return 'success' so that
+ // the host uses the CUDA offload kind.
+ if (auto *IA = dyn_cast<InputAction>(HostAction)) {
+ assert(!GpuArchList.empty() &&
+ "We should have at least one GPU architecture.");
+
+ // If the host input is not CUDA, we don't need to bother about this
+ // input.
+ if (IA->getType() != types::TY_CUDA) {
+ // The builder will ignore this input.
+ IsActive = false;
+ return ABRT_Inactive;
+ }
+
+ // Set the flag to true, so that the builder acts on the current input.
+ IsActive = true;
+
+ if (CompileHostOnly)
+ return ABRT_Success;
+
+ // Replicate inputs for each GPU architecture.
+ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
+ CudaDeviceActions.push_back(C.MakeAction<InputAction>(
+ IA->getInputArg(), types::TY_CUDA_DEVICE));
+
+ return ABRT_Success;
+ }
+
+ return IsActive ? ABRT_Success : ABRT_Inactive;
}
- for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
- OffloadAction::DeviceDependences DDep;
- DDep.add(*CudaDeviceActions[I], *CudaTC, CudaArchToString(GpuArchList[I]),
- Action::OFK_Cuda);
- Actions.push_back(
- C.MakeAction<OffloadAction>(DDep, CudaDeviceActions[I]->getType()));
+ void appendTopLevelActions(ActionList &AL) override {
+ // Utility to append actions to the top level list.
+ auto AddTopLevel = [&](Action *A, CudaArch BoundArch) {
+ OffloadAction::DeviceDependences Dep;
+ Dep.add(*A, *ToolChains.front(), CudaArchToString(BoundArch),
+ Action::OFK_Cuda);
+ AL.push_back(C.MakeAction<OffloadAction>(Dep, A->getType()));
+ };
+
+ // If we have a fat binary, add it to the list.
+ if (CudaFatBinary) {
+ AddTopLevel(CudaFatBinary, CudaArch::UNKNOWN);
+ CudaDeviceActions.clear();
+ CudaFatBinary = nullptr;
+ return;
+ }
+
+ if (CudaDeviceActions.empty())
+ return;
+
+      // If we have CUDA actions at this point, that's because we have a
+ // partial compilation, so we should have an action for each GPU
+ // architecture.
+ assert(CudaDeviceActions.size() == GpuArchList.size() &&
+ "Expecting one action per GPU architecture.");
+ assert(ToolChains.size() == 1 &&
+             "Expecting to have a single CUDA toolchain.");
+ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
+ AddTopLevel(CudaDeviceActions[I], GpuArchList[I]);
+
+ CudaDeviceActions.clear();
+ }
+
+ bool initialize() override {
+ // We don't need to support CUDA.
+ if (!C.hasOffloadToolChain<Action::OFK_Cuda>())
+ return false;
+
+ const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
+ assert(HostTC && "No toolchain for host compilation.");
+ if (HostTC->getTriple().isNVPTX()) {
+ // We do not support targeting NVPTX for host compilation. Throw
+ // an error and abort pipeline construction early so we don't trip
+ // asserts that assume device-side compilation.
+ C.getDriver().Diag(diag::err_drv_cuda_nvptx_host);
+ return true;
+ }
+
+ ToolChains.push_back(C.getSingleOffloadToolChain<Action::OFK_Cuda>());
+
+ Arg *PartialCompilationArg = Args.getLastArg(
+ options::OPT_cuda_host_only, options::OPT_cuda_device_only,
+ options::OPT_cuda_compile_host_device);
+ CompileHostOnly = PartialCompilationArg &&
+ PartialCompilationArg->getOption().matches(
+ options::OPT_cuda_host_only);
+ CompileDeviceOnly = PartialCompilationArg &&
+ PartialCompilationArg->getOption().matches(
+ options::OPT_cuda_device_only);
+
+ // Collect all cuda_gpu_arch parameters, removing duplicates.
+ llvm::SmallSet<CudaArch, 4> GpuArchs;
+ bool Error = false;
+ for (Arg *A : Args) {
+ if (!A->getOption().matches(options::OPT_cuda_gpu_arch_EQ))
+ continue;
+ A->claim();
+
+ const auto &ArchStr = A->getValue();
+ CudaArch Arch = StringToCudaArch(ArchStr);
+ if (Arch == CudaArch::UNKNOWN) {
+ C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr;
+ Error = true;
+ } else if (GpuArchs.insert(Arch).second)
+ GpuArchList.push_back(Arch);
+ }
+
+ // Default to sm_20 which is the lowest common denominator for supported
+ // GPUs.
+ // sm_20 code should work correctly, if suboptimally, on all newer GPUs.
+ if (GpuArchList.empty())
+ GpuArchList.push_back(CudaArch::SM_20);
+
+ return Error;
}
- // Kill host action in case of device-only compilation.
- if (CompileDeviceOnly)
+ };
+
+ /// Add the implementation for other specialized builders here.
+
+ /// Specialized builders being used by this offloading action builder.
+ SmallVector<DeviceActionBuilder *, 4> SpecializedBuilders;
+
+public:
+ OffloadingActionBuilder(Compilation &C, DerivedArgList &Args,
+ const Driver::InputList &Inputs)
+ : C(C), Args(Args) {
+ // Create a specialized builder for each device toolchain.
+
+ IsValid = true;
+
+ // Create a specialized builder for CUDA.
+ SpecializedBuilders.push_back(new CudaActionBuilder(C, Args, Inputs));
+
+ //
+ // TODO: Build other specialized builders here.
+ //
+
+ // Initialize all the builders, keeping track of errors.
+ for (auto *SB : SpecializedBuilders)
+ IsValid = IsValid && !SB->initialize();
+ }
+
+ ~OffloadingActionBuilder() {
+ for (auto *SB : SpecializedBuilders)
+ delete SB;
+ }
+
+ /// Generate an action that adds device dependences (if any) to a host action.
+ /// If no device dependence actions exist, just return the host action \a
+ /// HostAction. If an error is found or if no builder requires the host action
+ /// to be generated, return nullptr.
+ Action *
+ addDeviceDependencesToHostAction(Action *HostAction, const Arg *InputArg,
+ phases::ID CurPhase, phases::ID FinalPhase,
+ DeviceActionBuilder::PhasesTy &Phases) {
+ if (!IsValid)
return nullptr;
- return HostAction;
+
+ if (SpecializedBuilders.empty())
+ return HostAction;
+
+ assert(HostAction && "Invalid host action!");
+
+ OffloadAction::DeviceDependences DDeps;
+ // Check if all the programming models agree we should not emit the host
+ // action. Also, keep track of the offloading kinds employed.
+ auto &OffloadKind = InputArgToOffloadKindMap[InputArg];
+ unsigned InactiveBuilders = 0u;
+ unsigned IgnoringBuilders = 0u;
+ for (auto *SB : SpecializedBuilders) {
+ if (!SB->isValid()) {
+ ++InactiveBuilders;
+ continue;
+ }
+
+ auto RetCode = SB->getDeviceDepences(DDeps, CurPhase, FinalPhase, Phases);
+
+ // If the builder explicitly says the host action should be ignored,
+ // we need to increment the variable that tracks the builders that request
+ // the host object to be ignored.
+ if (RetCode == DeviceActionBuilder::ABRT_Ignore_Host)
+ ++IgnoringBuilders;
+
+ // Unless the builder was inactive for this action, we have to record the
+ // offload kind because the host will have to use it.
+ if (RetCode != DeviceActionBuilder::ABRT_Inactive)
+ OffloadKind |= SB->getAssociatedOffloadKind();
+ }
+
+ // If all builders agree that the host object should be ignored, just return
+ // nullptr.
+ if (IgnoringBuilders &&
+ SpecializedBuilders.size() == (InactiveBuilders + IgnoringBuilders))
+ return nullptr;
+
+ if (DDeps.getActions().empty())
+ return HostAction;
+
+ // We have dependences we need to bundle together. We use an offload action
+ // for that.
+ OffloadAction::HostDependence HDep(
+ *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
+ /*BoundArch=*/nullptr, DDeps);
+ return C.MakeAction<OffloadAction>(HDep, DDeps);
+ }
+
+ /// Generate an action that adds a host dependence to a device action. The
+ /// results will be kept in this action builder. Return true if an error was
+ /// found.
+ bool addHostDependenceToDeviceActions(Action *HostAction,
+ const Arg *InputArg) {
+ if (!IsValid)
+ return true;
+
+ assert(HostAction && "Invalid host action!");
+
+ // Register the offload kinds that are used.
+ auto &OffloadKind = InputArgToOffloadKindMap[InputArg];
+ for (auto *SB : SpecializedBuilders) {
+ if (!SB->isValid())
+ continue;
+
+ auto RetCode = SB->addDeviceDepences(HostAction);
+
+ // Host dependences for device actions are not compatible with that same
+ // action being ignored.
+ assert(RetCode != DeviceActionBuilder::ABRT_Ignore_Host &&
+             "Host dependence not expected to be ignored!");
+
+ // Unless the builder was inactive for this action, we have to record the
+ // offload kind because the host will have to use it.
+ if (RetCode != DeviceActionBuilder::ABRT_Inactive)
+ OffloadKind |= SB->getAssociatedOffloadKind();
+ }
+
+ return false;
}
- // If we're not a partial or device-only compilation, we compile each arch to
- // ptx and assemble to cubin, then feed the cubin *and* the ptx into a device
- // "link" action, which uses fatbinary to combine these cubins into one
- // fatbin. The fatbin is then an input to the host compilation.
- ActionList DeviceActions;
- for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
- Action* AssembleAction = CudaDeviceActions[I];
- assert(AssembleAction->getType() == types::TY_Object);
- assert(AssembleAction->getInputs().size() == 1);
-
- Action* BackendAction = AssembleAction->getInputs()[0];
- assert(BackendAction->getType() == types::TY_PP_Asm);
-
- for (auto &A : {AssembleAction, BackendAction}) {
- OffloadAction::DeviceDependences DDep;
- DDep.add(*A, *CudaTC, CudaArchToString(GpuArchList[I]), Action::OFK_Cuda);
- DeviceActions.push_back(C.MakeAction<OffloadAction>(DDep, A->getType()));
- }
- }
- auto FatbinAction =
- C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);
-
- // Return a new host action that incorporates original host action and all
- // device actions.
- OffloadAction::HostDependence HDep(*HostAction, *HostTC,
- /*BoundArch=*/nullptr, Action::OFK_Cuda);
- OffloadAction::DeviceDependences DDep;
- DDep.add(*FatbinAction, *CudaTC, /*BoundArch=*/nullptr, Action::OFK_Cuda);
- return C.MakeAction<OffloadAction>(HDep, DDep);
-}
+ /// Add the offloading top level actions to the provided action list.
+ bool appendTopLevelActions(ActionList &AL, Action *HostAction,
+ const Arg *InputArg) {
+ auto NumActions = AL.size();
+
+ for (auto *SB : SpecializedBuilders) {
+ if (!SB->isValid())
+ continue;
+ SB->appendTopLevelActions(AL);
+ }
+
+ assert(NumActions <= AL.size() && "Expecting more actions, not less!");
+
+ // Propagate to the current host action (if any) the offload information
+ // associated with the current input.
+ if (HostAction)
+ HostAction->propagateHostOffloadInfo(InputArgToOffloadKindMap[InputArg],
+ /*BoundArch=*/nullptr);
+
+ // If any action is added by the builders, -o is ambiguous if we have more
+ // than one top-level action.
+ if (NumActions < AL.size() && Args.hasArg(options::OPT_o) &&
+ AL.size() > 1) {
+ C.getDriver().Diag(
+ clang::diag::err_drv_output_argument_with_multiple_files);
+ return true;
+ }
+
+ return false;
+ }
+
+  /// Process the host linker action. This currently consists of replacing it
+  /// with an offload action if there are device link objects, and of
+  /// propagating to the host action all the offload kinds used in the current
+  /// compilation. The resulting action is returned.
+ Action *processHostLinkAction(Action *HostAction) {
+ // Add all the dependences from the device linking actions.
+ OffloadAction::DeviceDependences DDeps;
+ for (auto *SB : SpecializedBuilders) {
+ if (!SB->isValid())
+ continue;
+
+ SB->appendLinkDependences(DDeps);
+ }
+
+ // Calculate all the offload kinds used in the current compilation.
+ unsigned ActiveOffloadKinds = 0u;
+ for (auto &I : InputArgToOffloadKindMap)
+ ActiveOffloadKinds |= I.second;
+
+ // If we don't have device dependencies, we don't have to create an offload
+ // action.
+ if (DDeps.getActions().empty()) {
+ // Propagate all the active kinds to host action. Given that it is a link
+ // action it is assumed to depend on all actions generated so far.
+ HostAction->propagateHostOffloadInfo(ActiveOffloadKinds,
+ /*BoundArch=*/nullptr);
+ return HostAction;
+ }
+
+ // Create the offload action with all dependences. When an offload action
+ // is created the kinds are propagated to the host action, so we don't have
+    // to do that explicitly here.
+ OffloadAction::HostDependence HDep(
+ *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
+ /*BoundArch*/ nullptr, ActiveOffloadKinds);
+ return C.MakeAction<OffloadAction>(HDep, DDeps);
+ }
+};
+} // anonymous namespace.
void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
const InputList &Inputs, ActionList &Actions) const {
@@ -1640,8 +2037,8 @@ void Driver::BuildActions(Compilation &C
YcArg = YuArg = nullptr;
}
- // Track the host offload kinds used on this compilation.
- unsigned CompilationActiveOffloadHostKinds = 0u;
+ // Builder to be used to build offloading actions.
+ OffloadingActionBuilder OffloadBuilder(C, Args, Inputs);
// Construct the actions to perform.
ActionList LinkerInputs;
@@ -1707,17 +2104,14 @@ void Driver::BuildActions(Compilation &C
}
}
- phases::ID CudaInjectionPhase =
- (phases::Compile < FinalPhase &&
- llvm::find(PL, phases::Compile) != PL.end())
- ? phases::Compile
- : FinalPhase;
-
- // Track the host offload kinds used on this input.
- unsigned InputActiveOffloadHostKinds = 0u;
-
// Build the pipeline for this file.
Action *Current = C.MakeAction<InputAction>(*InputArg, InputType);
+
+ // Use the current host action in any of the offloading actions, if
+ // required.
+ if (OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg))
+ break;
+
for (SmallVectorImpl<phases::ID>::iterator i = PL.begin(), e = PL.end();
i != e; ++i) {
phases::ID Phase = *i;
@@ -1726,6 +2120,12 @@ void Driver::BuildActions(Compilation &C
if (Phase > FinalPhase)
break;
+ // Add any offload action the host action depends on.
+ Current = OffloadBuilder.addDeviceDependencesToHostAction(
+ Current, InputArg, Phase, FinalPhase, PL);
+ if (!Current)
+ break;
+
// Queue linker inputs.
if (Phase == phases::Link) {
assert((i + 1) == e && "linking must be final compilation step.");
@@ -1734,48 +2134,37 @@ void Driver::BuildActions(Compilation &C
break;
}
- // Some types skip the assembler phase (e.g., llvm-bc), but we can't
- // encode this in the steps because the intermediate type depends on
- // arguments. Just special case here.
- if (Phase == phases::Assemble && Current->getType() != types::TY_PP_Asm)
+ // Otherwise construct the appropriate action.
+ auto *NewCurrent = ConstructPhaseAction(C, Args, Phase, Current);
+
+ // We didn't create a new action, so we will just move to the next phase.
+ if (NewCurrent == Current)
continue;
- // Otherwise construct the appropriate action.
- Current = ConstructPhaseAction(C, Args, Phase, Current);
+ Current = NewCurrent;
- if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase) {
- Current = buildCudaActions(C, Args, InputArg, Current, Actions);
- if (!Current)
- break;
-
- // We produced a CUDA action for this input, so the host has to support
- // CUDA.
- InputActiveOffloadHostKinds |= Action::OFK_Cuda;
- CompilationActiveOffloadHostKinds |= Action::OFK_Cuda;
- }
+ // Use the current host action in any of the offloading actions, if
+ // required.
+ if (OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg))
+ break;
if (Current->getType() == types::TY_Nothing)
break;
}
- // If we ended with something, add to the output list. Also, propagate the
- // offload information to the top-level host action related with the current
- // input.
- if (Current) {
- if (InputActiveOffloadHostKinds)
- Current->propagateHostOffloadInfo(InputActiveOffloadHostKinds,
- /*BoundArch=*/nullptr);
+ // If we ended with something, add to the output list.
+ if (Current)
Actions.push_back(Current);
- }
+
+ // Add any top level actions generated for offloading.
+ OffloadBuilder.appendTopLevelActions(Actions, Current, InputArg);
}
- // Add a link action if necessary and propagate the offload information for
- // the current compilation.
+ // Add a link action if necessary.
if (!LinkerInputs.empty()) {
- Actions.push_back(
- C.MakeAction<LinkJobAction>(LinkerInputs, types::TY_Image));
- Actions.back()->propagateHostOffloadInfo(CompilationActiveOffloadHostKinds,
- /*BoundArch=*/nullptr);
+ Action *LA = C.MakeAction<LinkJobAction>(LinkerInputs, types::TY_Image);
+ LA = OffloadBuilder.processHostLinkAction(LA);
+ Actions.push_back(LA);
}
// If we are linking, claim any options which are obviously only used for
@@ -1797,6 +2186,13 @@ void Driver::BuildActions(Compilation &C
Action *Driver::ConstructPhaseAction(Compilation &C, const ArgList &Args,
phases::ID Phase, Action *Input) const {
llvm::PrettyStackTraceString CrashInfo("Constructing phase actions");
+
+ // Some types skip the assembler phase (e.g., llvm-bc), but we can't
+ // encode this in the steps because the intermediate type depends on
+ // arguments. Just special case here.
+ if (Phase == phases::Assemble && Input->getType() != types::TY_PP_Asm)
+ return Input;
+
// Build the appropriate action.
switch (Phase) {
case phases::Link:
Modified: cfe/trunk/lib/Driver/Types.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/Types.cpp?rev=282865&r1=282864&r2=282865&view=diff
==============================================================================
--- cfe/trunk/lib/Driver/Types.cpp (original)
+++ cfe/trunk/lib/Driver/Types.cpp Fri Sep 30 10:34:19 2016
@@ -254,7 +254,7 @@ void types::getCompilationPhases(ID Id,
}
}
- if (!onlyPrecompileType(Id) && Id != TY_CUDA_DEVICE) {
+ if (!onlyPrecompileType(Id)) {
P.push_back(phases::Link);
}
assert(0 < P.size() && "Not enough phases in list");
Modified: cfe/trunk/test/Driver/cuda-bindings.cu
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Driver/cuda-bindings.cu?rev=282865&r1=282864&r2=282865&view=diff
==============================================================================
--- cfe/trunk/test/Driver/cuda-bindings.cu (original)
+++ cfe/trunk/test/Driver/cuda-bindings.cu Fri Sep 30 10:34:19 2016
@@ -34,8 +34,8 @@
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings --cuda-gpu-arch=sm_30 %s -S 2>&1 \
// RUN: | FileCheck -check-prefix=ASM %s
-// ASM: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-device-cuda-nvptx64-nvidia-cuda-sm_30.s"
-// ASM: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: "cuda-bindings.s"
+// ASM-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-device-cuda-nvptx64-nvidia-cuda-sm_30.s"
+// ASM-DAG: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: "cuda-bindings.s"
//
// Test two gpu architectures with complete compilation.
@@ -62,9 +62,9 @@
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings \
// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
// RUN: | FileCheck -check-prefix=ASM2 %s
-// ASM2: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-device-cuda-nvptx64-nvidia-cuda-sm_30.s"
-// ASM2: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-device-cuda-nvptx64-nvidia-cuda-sm_35.s"
-// ASM2: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: "cuda-bindings.s"
+// ASM2-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-device-cuda-nvptx64-nvidia-cuda-sm_30.s"
+// ASM2-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-device-cuda-nvptx64-nvidia-cuda-sm_35.s"
+// ASM2-DAG: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: "cuda-bindings.s"
//
// Test one or more gpu architecture with complete compilation in host-only
Modified: cfe/trunk/test/Driver/cuda-phases.cu
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Driver/cuda-phases.cu?rev=282865&r1=282864&r2=282865&view=diff
==============================================================================
--- cfe/trunk/test/Driver/cuda-phases.cu (original)
+++ cfe/trunk/test/Driver/cuda-phases.cu Fri Sep 30 10:34:19 2016
@@ -13,84 +13,84 @@
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \
// RUN: | FileCheck -check-prefix=BIN %s
-// BIN: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// BIN: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
-// BIN: 2: compiler, {1}, ir, (host-cuda)
-// BIN: 3: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// BIN: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_30)
-// BIN: 5: compiler, {4}, ir, (device-cuda, sm_30)
-// BIN: 6: backend, {5}, assembler, (device-cuda, sm_30)
-// BIN: 7: assembler, {6}, object, (device-cuda, sm_30)
-// BIN: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {7}, object
-// BIN: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {6}, assembler
-// BIN: 10: linker, {8, 9}, cuda-fatbin, (device-cuda)
-// BIN: 11: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {10}, ir
-// BIN: 12: backend, {11}, assembler, (host-cuda)
-// BIN: 13: assembler, {12}, object, (host-cuda)
-// BIN: 14: linker, {13}, image, (host-cuda)
+// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
+// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
+// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30)
+// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30)
+// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30)
+// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30)
+// BIN-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object
+// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler
+// BIN-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-cuda)
+// BIN-DAG: [[P11:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P10]]}, ir
+// BIN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-cuda)
+// BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-cuda)
+// BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-cuda)
//
// Test single gpu architecture up to the assemble phase.
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \
// RUN: | FileCheck -check-prefix=ASM %s
-// ASM: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// ASM: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
-// ASM: 2: compiler, {1}, ir, (device-cuda, sm_30)
-// ASM: 3: backend, {2}, assembler, (device-cuda, sm_30)
-// ASM: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
-// ASM: 5: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// ASM: 6: preprocessor, {5}, cuda-cpp-output, (host-cuda)
-// ASM: 7: compiler, {6}, ir, (host-cuda)
-// ASM: 8: backend, {7}, assembler, (host-cuda)
+// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
+// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
+// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
+// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
+// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (host-cuda)
+// ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-cuda)
+// ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-cuda)
//
// Test two gpu architectures with complete compilation.
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
// RUN: | FileCheck -check-prefix=BIN2 %s
-// BIN2: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// BIN2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
-// BIN2: 2: compiler, {1}, ir, (host-cuda)
-// BIN2: 3: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// BIN2: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_30)
-// BIN2: 5: compiler, {4}, ir, (device-cuda, sm_30)
-// BIN2: 6: backend, {5}, assembler, (device-cuda, sm_30)
-// BIN2: 7: assembler, {6}, object, (device-cuda, sm_30)
-// BIN2: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {7}, object
-// BIN2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {6}, assembler
-// BIN2: 10: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// BIN2: 11: preprocessor, {10}, cuda-cpp-output, (device-cuda, sm_35)
-// BIN2: 12: compiler, {11}, ir, (device-cuda, sm_35)
-// BIN2: 13: backend, {12}, assembler, (device-cuda, sm_35)
-// BIN2: 14: assembler, {13}, object, (device-cuda, sm_35)
-// BIN2: 15: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {14}, object
-// BIN2: 16: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {13}, assembler
-// BIN2: 17: linker, {8, 9, 15, 16}, cuda-fatbin, (device-cuda)
-// BIN2: 18: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {17}, ir
-// BIN2: 19: backend, {18}, assembler, (host-cuda)
-// BIN2: 20: assembler, {19}, object, (host-cuda)
-// BIN2: 21: linker, {20}, image, (host-cuda)
+// BIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
+// BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
+// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30)
+// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30)
+// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30)
+// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30)
+// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object
+// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler
+// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
+// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (device-cuda, sm_35)
+// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, sm_35)
+// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-cuda, sm_35)
+// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, sm_35)
+// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P14]]}, object
+// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P13]]}, assembler
+// BIN2-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-cuda)
+// BIN2-DAG: [[P18:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P17]]}, ir
+// BIN2-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-cuda)
+// BIN2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-cuda)
+// BIN2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-cuda)
//
// Test two gpu architecturess up to the assemble phase.
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
// RUN: | FileCheck -check-prefix=ASM2 %s
-// ASM2: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// ASM2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
-// ASM2: 2: compiler, {1}, ir, (device-cuda, sm_30)
-// ASM2: 3: backend, {2}, assembler, (device-cuda, sm_30)
-// ASM2: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
-// ASM2: 5: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// ASM2: 6: preprocessor, {5}, cuda-cpp-output, (device-cuda, sm_35)
-// ASM2: 7: compiler, {6}, ir, (device-cuda, sm_35)
-// ASM2: 8: backend, {7}, assembler, (device-cuda, sm_35)
-// ASM2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {8}, assembler
-// ASM2: 10: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// ASM2: 11: preprocessor, {10}, cuda-cpp-output, (host-cuda)
-// ASM2: 12: compiler, {11}, ir, (host-cuda)
-// ASM2: 13: backend, {12}, assembler, (host-cuda)
+// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
+// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
+// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
+// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
+// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
+// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35)
+// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35)
+// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35)
+// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler
+// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (host-cuda)
+// ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-cuda)
+// ASM2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-cuda)
//
// Test single gpu architecture with complete compilation in host-only
@@ -98,25 +98,22 @@
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \
// RUN: | FileCheck -check-prefix=HBIN %s
-// HBIN: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HBIN: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
-// HBIN: 2: compiler, {1}, ir, (host-cuda)
-// HBIN: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
-// HBIN: 4: backend, {3}, assembler, (host-cuda)
-// HBIN: 5: assembler, {4}, object, (host-cuda)
-// HBIN: 6: linker, {5}, image, (host-cuda)
-
+// HBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
+// HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
+// HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
+// HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda)
+// HBIN-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-cuda)
//
// Test single gpu architecture up to the assemble phase in host-only
// compilation mode.
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \
// RUN: | FileCheck -check-prefix=HASM %s
-// HASM: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HASM: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
-// HASM: 2: compiler, {1}, ir, (host-cuda)
-// HASM: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
-// HASM: 4: backend, {3}, assembler, (host-cuda)
+// HASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
+// HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
+// HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
//
// Test two gpu architectures with complete compilation in host-only
@@ -124,13 +121,12 @@
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \
// RUN: | FileCheck -check-prefix=HBIN2 %s
-// HBIN2: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HBIN2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
-// HBIN2: 2: compiler, {1}, ir, (host-cuda)
-// HBIN2: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
-// HBIN2: 4: backend, {3}, assembler, (host-cuda)
-// HBIN2: 5: assembler, {4}, object, (host-cuda)
-// HBIN2: 6: linker, {5}, image, (host-cuda)
+// HBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
+// HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
+// HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
+// HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda)
+// HBIN2-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-cuda)
//
// Test two gpu architectures up to the assemble phase in host-only
@@ -138,11 +134,10 @@
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \
// RUN: | FileCheck -check-prefix=HASM2 %s
-// HASM2: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HASM2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
-// HASM2: 2: compiler, {1}, ir, (host-cuda)
-// HASM2: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
-// HASM2: 4: backend, {3}, assembler, (host-cuda)
+// HASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
+// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
+// HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
+// HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
//
// Test single gpu architecture with complete compilation in device-only
@@ -150,12 +145,12 @@
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \
// RUN: | FileCheck -check-prefix=DBIN %s
-// DBIN: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DBIN: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
-// DBIN: 2: compiler, {1}, ir, (device-cuda, sm_30)
-// DBIN: 3: backend, {2}, assembler, (device-cuda, sm_30)
-// DBIN: 4: assembler, {3}, object, (device-cuda, sm_30)
-// DBIN: 5: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {4}, object
+// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
+// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
+// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
+// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30)
+// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object
//
// Test single gpu architecture up to the assemble phase in device-only
@@ -163,11 +158,11 @@
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \
// RUN: | FileCheck -check-prefix=DASM %s
-// DASM: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DASM: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
-// DASM: 2: compiler, {1}, ir, (device-cuda, sm_30)
-// DASM: 3: backend, {2}, assembler, (device-cuda, sm_30)
-// DASM: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
+// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
+// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
+// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
+// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
//
// Test two gpu architectures with complete compilation in device-only
@@ -175,18 +170,18 @@
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \
// RUN: | FileCheck -check-prefix=DBIN2 %s
-// DBIN2: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DBIN2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
-// DBIN2: 2: compiler, {1}, ir, (device-cuda, sm_30)
-// DBIN2: 3: backend, {2}, assembler, (device-cuda, sm_30)
-// DBIN2: 4: assembler, {3}, object, (device-cuda, sm_30)
-// DBIN2: 5: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {4}, object
-// DBIN2: 6: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// DBIN2: 7: preprocessor, {6}, cuda-cpp-output, (device-cuda, sm_35)
-// DBIN2: 8: compiler, {7}, ir, (device-cuda, sm_35)
-// DBIN2: 9: backend, {8}, assembler, (device-cuda, sm_35)
-// DBIN2: 10: assembler, {9}, object, (device-cuda, sm_35)
-// DBIN2: 11: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {10}, object
+// DBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
+// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
+// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
+// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30)
+// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object
+// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
+// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, cuda-cpp-output, (device-cuda, sm_35)
+// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, sm_35)
+// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-cuda, sm_35)
+// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, sm_35)
+// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P10]]}, object
//
// Test two gpu architectures up to the assemble phase in device-only
@@ -194,13 +189,13 @@
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \
// RUN: | FileCheck -check-prefix=DASM2 %s
-// DASM2: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DASM2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
-// DASM2: 2: compiler, {1}, ir, (device-cuda, sm_30)
-// DASM2: 3: backend, {2}, assembler, (device-cuda, sm_30)
-// DASM2: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
-// DASM2: 5: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// DASM2: 6: preprocessor, {5}, cuda-cpp-output, (device-cuda, sm_35)
-// DASM2: 7: compiler, {6}, ir, (device-cuda, sm_35)
-// DASM2: 8: backend, {7}, assembler, (device-cuda, sm_35)
-// DASM2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {8}, assembler
+// DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
+// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
+// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
+// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
+// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
+// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
+// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35)
+// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35)
+// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35)
+// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler