r333483 - Add action builder for HIP
Yaxun Liu via cfe-commits
cfe-commits at lists.llvm.org
Tue May 29 17:49:10 PDT 2018
Author: yaxunl
Date: Tue May 29 17:49:10 2018
New Revision: 333483
URL: http://llvm.org/viewvc/llvm-project?rev=333483&view=rev
Log:
Add action builder for HIP
To support separate compile/link and linking across device IR in different source files,
a new HIP action builder is introduced. Basically it compiles/links host and device
code separately, and embed fat binary in host linking stage through linker script.
Differential Revision: https://reviews.llvm.org/D46476
Modified:
cfe/trunk/lib/Driver/Driver.cpp
cfe/trunk/test/Driver/cuda-phases.cu
Modified: cfe/trunk/lib/Driver/Driver.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/Driver.cpp?rev=333483&r1=333482&r2=333483&view=diff
==============================================================================
--- cfe/trunk/lib/Driver/Driver.cpp (original)
+++ cfe/trunk/lib/Driver/Driver.cpp Tue May 29 17:49:10 2018
@@ -2151,9 +2151,10 @@ class OffloadingActionBuilder final {
}
};
- /// CUDA action builder. It injects device code in the host backend
- /// action.
- class CudaActionBuilder final : public DeviceActionBuilder {
+ /// Base class for CUDA/HIP action builder. It injects device code in
+ /// the host backend action.
+ class CudaActionBuilderBase : public DeviceActionBuilder {
+ protected:
/// Flags to signal if the user requested host-only or device-only
/// compilation.
bool CompileHostOnly = false;
@@ -2170,115 +2171,11 @@ class OffloadingActionBuilder final {
/// Flag that is set to true if this builder acted on the current input.
bool IsActive = false;
-
public:
- CudaActionBuilder(Compilation &C, DerivedArgList &Args,
- const Driver::InputList &Inputs)
- : DeviceActionBuilder(C, Args, Inputs, Action::OFK_Cuda) {}
-
- ActionBuilderReturnCode
- getDeviceDependences(OffloadAction::DeviceDependences &DA,
- phases::ID CurPhase, phases::ID FinalPhase,
- PhasesTy &Phases) override {
- if (!IsActive)
- return ABRT_Inactive;
-
- // If we don't have more CUDA actions, we don't have any dependences to
- // create for the host.
- if (CudaDeviceActions.empty())
- return ABRT_Success;
-
- assert(CudaDeviceActions.size() == GpuArchList.size() &&
- "Expecting one action per GPU architecture.");
- assert(!CompileHostOnly &&
- "Not expecting CUDA actions in host-only compilation.");
-
- // If we are generating code for the device or we are in a backend phase,
- // we attempt to generate the fat binary. We compile each arch to ptx and
- // assemble to cubin, then feed the cubin *and* the ptx into a device
- // "link" action, which uses fatbinary to combine these cubins into one
- // fatbin. The fatbin is then an input to the host action if not in
- // device-only mode.
- if (CompileDeviceOnly || CurPhase == phases::Backend) {
- ActionList DeviceActions;
- for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
- // Produce the device action from the current phase up to the assemble
- // phase.
- for (auto Ph : Phases) {
- // Skip the phases that were already dealt with.
- if (Ph < CurPhase)
- continue;
- // We have to be consistent with the host final phase.
- if (Ph > FinalPhase)
- break;
-
- CudaDeviceActions[I] = C.getDriver().ConstructPhaseAction(
- C, Args, Ph, CudaDeviceActions[I], Action::OFK_Cuda);
-
- if (Ph == phases::Assemble)
- break;
- }
-
- // If we didn't reach the assemble phase, we can't generate the fat
- // binary. We don't need to generate the fat binary if we are not in
- // device-only mode.
- if (!isa<AssembleJobAction>(CudaDeviceActions[I]) ||
- CompileDeviceOnly)
- continue;
-
- Action *AssembleAction = CudaDeviceActions[I];
- assert(AssembleAction->getType() == types::TY_Object);
- assert(AssembleAction->getInputs().size() == 1);
-
- Action *BackendAction = AssembleAction->getInputs()[0];
- assert(BackendAction->getType() == types::TY_PP_Asm);
-
- for (auto &A : {AssembleAction, BackendAction}) {
- OffloadAction::DeviceDependences DDep;
- DDep.add(*A, *ToolChains.front(), CudaArchToString(GpuArchList[I]),
- Action::OFK_Cuda);
- DeviceActions.push_back(
- C.MakeAction<OffloadAction>(DDep, A->getType()));
- }
- }
-
- // We generate the fat binary if we have device input actions.
- if (!DeviceActions.empty()) {
- CudaFatBinary =
- C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);
-
- if (!CompileDeviceOnly) {
- DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
- Action::OFK_Cuda);
- // Clear the fat binary, it is already a dependence to an host
- // action.
- CudaFatBinary = nullptr;
- }
-
- // Remove the CUDA actions as they are already connected to an host
- // action or fat binary.
- CudaDeviceActions.clear();
- }
-
- // We avoid creating host action in device-only mode.
- return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success;
- } else if (CurPhase > phases::Backend) {
- // If we are past the backend phase and still have a device action, we
- // don't have to do anything as this action is already a device
- // top-level action.
- return ABRT_Success;
- }
-
- assert(CurPhase < phases::Backend && "Generating single CUDA "
- "instructions should only occur "
- "before the backend phase!");
-
- // By default, we produce an action for each device arch.
- for (Action *&A : CudaDeviceActions)
- A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A);
-
- return ABRT_Success;
- }
+ CudaActionBuilderBase(Compilation &C, DerivedArgList &Args,
+ const Driver::InputList &Inputs,
+ Action::OffloadKind OFKind)
+ : DeviceActionBuilder(C, Args, Inputs, OFKind) {}
ActionBuilderReturnCode addDeviceDepences(Action *HostAction) override {
// While generating code for CUDA, we only depend on the host input action
@@ -2317,6 +2214,17 @@ class OffloadingActionBuilder final {
return ABRT_Success;
}
+ // If this is an unbundling action use it as is for each CUDA toolchain.
+ if (auto *UA = dyn_cast<OffloadUnbundlingJobAction>(HostAction)) {
+ CudaDeviceActions.clear();
+ for (auto Arch : GpuArchList) {
+ CudaDeviceActions.push_back(UA);
+ UA->registerDependentActionInfo(ToolChains[0], CudaArchToString(Arch),
+ AssociatedOffloadKind);
+ }
+ return ABRT_Success;
+ }
+
return IsActive ? ABRT_Success : ABRT_Inactive;
}
@@ -2325,7 +2233,7 @@ class OffloadingActionBuilder final {
auto AddTopLevel = [&](Action *A, CudaArch BoundArch) {
OffloadAction::DeviceDependences Dep;
Dep.add(*A, *ToolChains.front(), CudaArchToString(BoundArch),
- Action::OFK_Cuda);
+ AssociatedOffloadKind);
AL.push_back(C.MakeAction<OffloadAction>(Dep, A->getType()));
};
@@ -2354,8 +2262,17 @@ class OffloadingActionBuilder final {
}
bool initialize() override {
+ assert(AssociatedOffloadKind == Action::OFK_Cuda ||
+ AssociatedOffloadKind == Action::OFK_HIP);
+
// We don't need to support CUDA.
- if (!C.hasOffloadToolChain<Action::OFK_Cuda>())
+ if (AssociatedOffloadKind == Action::OFK_Cuda &&
+ !C.hasOffloadToolChain<Action::OFK_Cuda>())
+ return false;
+
+ // We don't need to support HIP.
+ if (AssociatedOffloadKind == Action::OFK_HIP &&
+ !C.hasOffloadToolChain<Action::OFK_HIP>())
return false;
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
@@ -2370,7 +2287,10 @@ class OffloadingActionBuilder final {
return true;
}
- ToolChains.push_back(C.getSingleOffloadToolChain<Action::OFK_Cuda>());
+ ToolChains.push_back(
+ AssociatedOffloadKind == Action::OFK_Cuda
+ ? C.getSingleOffloadToolChain<Action::OFK_Cuda>()
+ : C.getSingleOffloadToolChain<Action::OFK_HIP>());
Arg *PartialCompilationArg = Args.getLastArg(
options::OPT_cuda_host_only, options::OPT_cuda_device_only,
@@ -2423,6 +2343,187 @@ class OffloadingActionBuilder final {
}
};
+ /// \brief CUDA action builder. It injects device code in the host backend
+ /// action.
+ class CudaActionBuilder final : public CudaActionBuilderBase {
+ public:
+ CudaActionBuilder(Compilation &C, DerivedArgList &Args,
+ const Driver::InputList &Inputs)
+ : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_Cuda) {}
+
+ ActionBuilderReturnCode
+ getDeviceDependences(OffloadAction::DeviceDependences &DA,
+ phases::ID CurPhase, phases::ID FinalPhase,
+ PhasesTy &Phases) override {
+ if (!IsActive)
+ return ABRT_Inactive;
+
+ // If we don't have more CUDA actions, we don't have any dependences to
+ // create for the host.
+ if (CudaDeviceActions.empty())
+ return ABRT_Success;
+
+ assert(CudaDeviceActions.size() == GpuArchList.size() &&
+ "Expecting one action per GPU architecture.");
+ assert(!CompileHostOnly &&
+ "Not expecting CUDA actions in host-only compilation.");
+
+ // If we are generating code for the device or we are in a backend phase,
+ // we attempt to generate the fat binary. We compile each arch to ptx and
+ // assemble to cubin, then feed the cubin *and* the ptx into a device
+ // "link" action, which uses fatbinary to combine these cubins into one
+ // fatbin. The fatbin is then an input to the host action if not in
+ // device-only mode.
+ if (CompileDeviceOnly || CurPhase == phases::Backend) {
+ ActionList DeviceActions;
+ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+ // Produce the device action from the current phase up to the assemble
+ // phase.
+ for (auto Ph : Phases) {
+ // Skip the phases that were already dealt with.
+ if (Ph < CurPhase)
+ continue;
+ // We have to be consistent with the host final phase.
+ if (Ph > FinalPhase)
+ break;
+
+ CudaDeviceActions[I] = C.getDriver().ConstructPhaseAction(
+ C, Args, Ph, CudaDeviceActions[I], Action::OFK_Cuda);
+
+ if (Ph == phases::Assemble)
+ break;
+ }
+
+ // If we didn't reach the assemble phase, we can't generate the fat
+ // binary. We don't need to generate the fat binary if we are not in
+ // device-only mode.
+ if (!isa<AssembleJobAction>(CudaDeviceActions[I]) ||
+ CompileDeviceOnly)
+ continue;
+
+ Action *AssembleAction = CudaDeviceActions[I];
+ assert(AssembleAction->getType() == types::TY_Object);
+ assert(AssembleAction->getInputs().size() == 1);
+
+ Action *BackendAction = AssembleAction->getInputs()[0];
+ assert(BackendAction->getType() == types::TY_PP_Asm);
+
+ for (auto &A : {AssembleAction, BackendAction}) {
+ OffloadAction::DeviceDependences DDep;
+ DDep.add(*A, *ToolChains.front(), CudaArchToString(GpuArchList[I]),
+ Action::OFK_Cuda);
+ DeviceActions.push_back(
+ C.MakeAction<OffloadAction>(DDep, A->getType()));
+ }
+ }
+
+ // We generate the fat binary if we have device input actions.
+ if (!DeviceActions.empty()) {
+ CudaFatBinary =
+ C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);
+
+ if (!CompileDeviceOnly) {
+ DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
+ Action::OFK_Cuda);
+ // Clear the fat binary, it is already a dependence to an host
+ // action.
+ CudaFatBinary = nullptr;
+ }
+
+ // Remove the CUDA actions as they are already connected to an host
+ // action or fat binary.
+ CudaDeviceActions.clear();
+ }
+
+ // We avoid creating host action in device-only mode.
+ return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success;
+ } else if (CurPhase > phases::Backend) {
+ // If we are past the backend phase and still have a device action, we
+ // don't have to do anything as this action is already a device
+ // top-level action.
+ return ABRT_Success;
+ }
+
+ assert(CurPhase < phases::Backend && "Generating single CUDA "
+ "instructions should only occur "
+ "before the backend phase!");
+
+ // By default, we produce an action for each device arch.
+ for (Action *&A : CudaDeviceActions)
+ A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A);
+
+ return ABRT_Success;
+ }
+ };
+ /// \brief HIP action builder. It injects device code in the host backend
+ /// action.
+ class HIPActionBuilder final : public CudaActionBuilderBase {
+ /// The linker inputs obtained for each device arch.
+ SmallVector<ActionList, 8> DeviceLinkerInputs;
+
+ public:
+ HIPActionBuilder(Compilation &C, DerivedArgList &Args,
+ const Driver::InputList &Inputs)
+ : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {}
+
+ bool canUseBundlerUnbundler() const override { return true; }
+
+ ActionBuilderReturnCode
+ getDeviceDependences(OffloadAction::DeviceDependences &DA,
+ phases::ID CurPhase, phases::ID FinalPhase,
+ PhasesTy &Phases) override {
+ // amdgcn does not support linking of object files, therefore we skip
+ // backend and assemble phases to output LLVM IR.
+ if (CudaDeviceActions.empty() || CurPhase == phases::Backend ||
+ CurPhase == phases::Assemble)
+ return ABRT_Success;
+
+ assert((CurPhase == phases::Link ||
+ CudaDeviceActions.size() == GpuArchList.size()) &&
+ "Expecting one action per GPU architecture.");
+ assert(!CompileHostOnly &&
+ "Not expecting CUDA actions in host-only compilation.");
+
+ // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
+ // This happens to each device action originated from each input file.
+ // Later on, device actions in DeviceLinkerInputs are used to create
+ // device link actions in appendLinkDependences and the created device
+ // link actions are passed to the offload action as device dependence.
+ if (CurPhase == phases::Link) {
+ DeviceLinkerInputs.resize(CudaDeviceActions.size());
+ auto LI = DeviceLinkerInputs.begin();
+ for (auto *A : CudaDeviceActions) {
+ LI->push_back(A);
+ ++LI;
+ }
+
+ // We will pass the device action as a host dependence, so we don't
+ // need to do anything else with them.
+ CudaDeviceActions.clear();
+ return ABRT_Success;
+ }
+
+ // By default, we produce an action for each device arch.
+ for (Action *&A : CudaDeviceActions)
+ A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A,
+ AssociatedOffloadKind);
+
+ return ABRT_Success;
+ }
+
+ void appendLinkDependences(OffloadAction::DeviceDependences &DA) override {
+ // Append a new link action for each device.
+ unsigned I = 0;
+ for (auto &LI : DeviceLinkerInputs) {
+ auto *DeviceLinkAction =
+ C.MakeAction<LinkJobAction>(LI, types::TY_Image);
+ DA.add(*DeviceLinkAction, *ToolChains[0],
+ CudaArchToString(GpuArchList[I]), AssociatedOffloadKind);
+ ++I;
+ }
+ }
+ };
+
/// OpenMP action builder. The host bitcode is passed to the device frontend
/// and all the device linked images are passed to the host link phase.
class OpenMPActionBuilder final : public DeviceActionBuilder {
@@ -2589,6 +2690,9 @@ public:
// Create a specialized builder for CUDA.
SpecializedBuilders.push_back(new CudaActionBuilder(C, Args, Inputs));
+ // Create a specialized builder for HIP.
+ SpecializedBuilders.push_back(new HIPActionBuilder(C, Args, Inputs));
+
// Create a specialized builder for OpenMP.
SpecializedBuilders.push_back(new OpenMPActionBuilder(C, Args, Inputs));
Modified: cfe/trunk/test/Driver/cuda-phases.cu
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Driver/cuda-phases.cu?rev=333483&r1=333482&r2=333483&view=diff
==============================================================================
--- cfe/trunk/test/Driver/cuda-phases.cu (original)
+++ cfe/trunk/test/Driver/cuda-phases.cu Tue May 29 17:49:10 2018
@@ -7,195 +7,267 @@
// REQUIRES: clang-driver
// REQUIRES: powerpc-registered-target
// REQUIRES: nvptx-registered-target
-
+// REQUIRES: amdgpu-registered-target
//
// Test single gpu architecture with complete compilation.
//
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \
-// RUN: | FileCheck -check-prefix=BIN %s
-// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
-// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30)
-// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30)
-// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30)
-// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30)
-// BIN-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object
-// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler
-// BIN-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-cuda)
-// BIN-DAG: [[P11:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P10]]}, ir
-// BIN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-cuda)
-// BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-cuda)
-// BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,BIN_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD %s
+// BIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
+// BIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
+// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// BIN_NV-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH:sm_30]])
+// BIN_AMD-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH:gfx803]])
+// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
+// BIN_NV-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
+// BIN_NV-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
+// BIN_NV-DAG: [[P8:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda]]:[[ARCH]])" {[[P7]]}, object
+// BIN_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH]])" {[[P6]]}, assembler
+// BIN_NV-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-[[T]])
+// BIN_NV-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-[[T]] ([[TRIPLE]])" {[[P10]]}, ir
+// BIN_NV-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// BIN_AMD-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
+// BIN_AMD-DAG: [[P15:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH]])
+// BIN_AMD-DAG: [[P16:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P14]]},
+// BIN_AMD-DAG-SAME: "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P15]]}, object
//
// Test single gpu architecture up to the assemble phase.
//
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \
-// RUN: | FileCheck -check-prefix=ASM %s
-// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
-// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (host-cuda)
-// ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-cuda)
-// ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=ASM,ASM_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=ASM,ASM_AMD %s
+// ASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
+// ASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// ASM_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
+// ASM_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler
+// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (host-[[T]])
+// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (host-[[T]])
+// ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-[[T]])
+// ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-[[T]])
//
// Test two gpu architectures with complete compilation.
//
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
-// RUN: | FileCheck -check-prefix=BIN2 %s
-// BIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
-// BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30)
-// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30)
-// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30)
-// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30)
-// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object
-// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler
-// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (device-cuda, sm_35)
-// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, sm_35)
-// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-cuda, sm_35)
-// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, sm_35)
-// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P14]]}, object
-// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P13]]}, assembler
-// BIN2-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-cuda)
-// BIN2-DAG: [[P18:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P17]]}, ir
-// BIN2-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-cuda)
-// BIN2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-cuda)
-// BIN2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN2,BIN2_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN2,BIN2_AMD %s
+// BIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
+// BIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
+// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH1:sm_30|gfx803]])
+// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH1]])
+// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH1]])
+// BIN2_NV-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH1]])
+// BIN2_NV-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH1]])
+// BIN2_NV-DAG: [[P8:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda]]:[[ARCH1]])" {[[P7]]}, object
+// BIN2_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH1]])" {[[P6]]}, assembler
+// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]])
+// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-[[T]], [[ARCH2]])
+// BIN2_NV-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-[[T]], [[ARCH2]])
+// BIN2_NV-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-[[T]], [[ARCH2]])
+// BIN2_NV-DAG: [[P15:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P14]]}, object
+// BIN2_NV-DAG: [[P16:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P13]]}, assembler
+// BIN2_NV-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-[[T]])
+// BIN2_NV-DAG: [[P18:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-[[T]] ([[TRIPLE]])" {[[P17]]}, ir
+// BIN2_NV-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-[[T]])
+// BIN2_AMD-DAG: [[P19:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// BIN2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-[[T]])
+// BIN2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-[[T]])
+// BIN2_AMD-DAG: [[P22:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH1]])
+// BIN2_AMD-DAG: [[P23:[0-9]+]]: linker, {[[P12]]}, image, (device-[[T]], [[ARCH2]])
+// BIN2_AMD-DAG: [[P24:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P21]]},
+// BIN2_AMD-DAG-SAME: "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH1]])" {[[P22]]},
+// BIN2_AMD-DAG-SAME: "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH2]])" {[[P23]]}, object
//
// Test two gpu architecturess up to the assemble phase.
//
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
-// RUN: | FileCheck -check-prefix=ASM2 %s
-// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
-// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35)
-// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35)
-// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35)
-// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler
-// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (host-cuda)
-// ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-cuda)
-// ASM2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=ASM2,ASM2_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=ASM2,ASM2_AMD %s
+// ASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH1:sm_30]])
+// ASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH1:gfx803]])
+// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH1]])
+// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH1]])
+// ASM2_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH1]])
+// ASM2_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH1]])" {[[P3]]}, assembler
+// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]])
+// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]])
+// ASM2_NV-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]])
+// ASM2_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler
+// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (host-[[T]])
+// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (host-[[T]])
+// ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-[[T]])
+// ASM2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-[[T]])
//
// Test single gpu architecture with complete compilation in host-only
// compilation mode.
//
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \
-// RUN: | FileCheck -check-prefix=HBIN %s
-// HBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
-// HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
-// HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda)
-// HBIN-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=HBIN,HBIN_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=HBIN,HBIN_AMD %s
+// HBIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
+// HBIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
+// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-[[T]])
+// HBIN-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-[[T]])
//
// Test single gpu architecture up to the assemble phase in host-only
// compilation mode.
//
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \
-// RUN: | FileCheck -check-prefix=HASM %s
-// HASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
-// HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=HASM,HASM_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 %s --cuda-host-only -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=HASM,HASM_AMD %s
+// HASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
+// HASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
+// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
//
// Test two gpu architectures with complete compilation in host-only
// compilation mode.
//
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \
-// RUN: | FileCheck -check-prefix=HBIN2 %s
-// HBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
-// HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
-// HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda)
-// HBIN2-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=HBIN2,HBIN2_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=HBIN2,HBIN2_AMD %s
+// HBIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
+// HBIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
+// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-[[T]])
+// HBIN2-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-[[T]])
//
// Test two gpu architectures up to the assemble phase in host-only
// compilation mode.
//
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \
-// RUN: | FileCheck -check-prefix=HASM2 %s
-// HASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
-// HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S \
+// RUN: 2>&1 | FileCheck -check-prefixes=HASM2,HASM2_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only -S \
+// RUN: 2>&1 | FileCheck -check-prefixes=HASM2,HASM2_AMD %s
+// HASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
+// HASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
+// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
//
// Test single gpu architecture with complete compilation in device-only
// compilation mode.
//
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \
-// RUN: | FileCheck -check-prefix=DBIN %s
-// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30)
-// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=DBIN,DBIN_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=DBIN,DBIN_AMD %s
+// DBIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
+// DBIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// DBIN_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
+// DBIN_NV-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-[[T]], [[ARCH]])
+// DBIN_NV-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] (nvptx64-nvidia-cuda:[[ARCH]])" {[[P4]]}, object
//
// Test single gpu architecture up to the assemble phase in device-only
// compilation mode.
//
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \
-// RUN: | FileCheck -check-prefix=DASM %s
-// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=DASM,DASM_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=DASM,DASM_AMD %s
+// DASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
+// DASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// DASM_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
+// DASM_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler
//
// Test two gpu architectures with complete compilation in device-only
// compilation mode.
//
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \
-// RUN: | FileCheck -check-prefix=DBIN2 %s
-// DBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30)
-// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object
-// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, cuda-cpp-output, (device-cuda, sm_35)
-// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, sm_35)
-// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-cuda, sm_35)
-// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, sm_35)
-// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P10]]}, object
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=DBIN2,DBIN2_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only \
+// RUN: 2>&1 | FileCheck -check-prefixes=DBIN2,DBIN2_AMD %s
+// DBIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
+// DBIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// DBIN2_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
+// DBIN2_NV-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-[[T]], [[ARCH]])
+// DBIN2_NV-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda]]:[[ARCH]])" {[[P4]]}, object
+// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]])
+// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-[[T]], [[ARCH2]])
+// DBIN2_NV-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-[[T]], [[ARCH2]])
+// DBIN2_NV-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-[[T]], [[ARCH2]])
+// DBIN2_NV-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P10]]}, object
//
// Test two gpu architectures up to the assemble phase in device-only
// compilation mode.
//
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \
-// RUN: | FileCheck -check-prefix=DASM2 %s
-// DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
-// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35)
-// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35)
-// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35)
-// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S \
+// RUN: 2>&1 | FileCheck -check-prefixes=DASM2,DASM2_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=DASM2,DASM2_AMD %s
+// DASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
+// DASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// DASM2_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
+// DASM2_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler
+// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]])
+// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]])
+// DASM2_NV-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]])
+// DASM2_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler
More information about the cfe-commits
mailing list