r333483 - Add action builder for HIP

Tue May 29 17:49:10 PDT 2018

Author: yaxunl
Date: Tue May 29 17:49:10 2018
New Revision: 333483

URL: http://llvm.org/viewvc/llvm-project?rev=333483&view=rev
Log:
Add action builder for HIP

To support separate compile/link and linking across device IR in different source files,
a new HIP action builder is introduced. Basically it compiles/links host and device
code separately, and embed fat binary in host linking stage through linker script.

Differential Revision: https://reviews.llvm.org/D46476

Modified:
    cfe/trunk/lib/Driver/Driver.cpp
    cfe/trunk/test/Driver/cuda-phases.cu

Modified: cfe/trunk/lib/Driver/Driver.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/Driver.cpp?rev=333483&r1=333482&r2=333483&view=diff
==============================================================================

--- cfe/trunk/lib/Driver/Driver.cpp (original)
+++ cfe/trunk/lib/Driver/Driver.cpp Tue May 29 17:49:10 2018
@@ -2151,9 +2151,10 @@ class OffloadingActionBuilder final {
     }
   };
 
-  /// CUDA action builder. It injects device code in the host backend
-  /// action.
-  class CudaActionBuilder final : public DeviceActionBuilder {
+  /// Base class for CUDA/HIP action builder. It injects device code in
+  /// the host backend action.
+  class CudaActionBuilderBase : public DeviceActionBuilder {
+  protected:
     /// Flags to signal if the user requested host-only or device-only
     /// compilation.
     bool CompileHostOnly = false;
@@ -2170,115 +2171,11 @@ class OffloadingActionBuilder final {
 
     /// Flag that is set to true if this builder acted on the current input.
     bool IsActive = false;
-
   public:
-    CudaActionBuilder(Compilation &C, DerivedArgList &Args,
-                      const Driver::InputList &Inputs)
-        : DeviceActionBuilder(C, Args, Inputs, Action::OFK_Cuda) {}
-
-    ActionBuilderReturnCode
-    getDeviceDependences(OffloadAction::DeviceDependences &DA,
-                         phases::ID CurPhase, phases::ID FinalPhase,
-                         PhasesTy &Phases) override {
-      if (!IsActive)
-        return ABRT_Inactive;
-
-      // If we don't have more CUDA actions, we don't have any dependences to
-      // create for the host.
-      if (CudaDeviceActions.empty())
-        return ABRT_Success;
-
-      assert(CudaDeviceActions.size() == GpuArchList.size() &&
-             "Expecting one action per GPU architecture.");
-      assert(!CompileHostOnly &&
-             "Not expecting CUDA actions in host-only compilation.");
-
-      // If we are generating code for the device or we are in a backend phase,
-      // we attempt to generate the fat binary. We compile each arch to ptx and
-      // assemble to cubin, then feed the cubin *and* the ptx into a device
-      // "link" action, which uses fatbinary to combine these cubins into one
-      // fatbin.  The fatbin is then an input to the host action if not in
-      // device-only mode.
-      if (CompileDeviceOnly || CurPhase == phases::Backend) {
-        ActionList DeviceActions;
-        for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
-          // Produce the device action from the current phase up to the assemble
-          // phase.
-          for (auto Ph : Phases) {
-            // Skip the phases that were already dealt with.
-            if (Ph < CurPhase)
-              continue;
-            // We have to be consistent with the host final phase.
-            if (Ph > FinalPhase)
-              break;
-
-            CudaDeviceActions[I] = C.getDriver().ConstructPhaseAction(
-                C, Args, Ph, CudaDeviceActions[I], Action::OFK_Cuda);
-
-            if (Ph == phases::Assemble)
-              break;
-          }
-
-          // If we didn't reach the assemble phase, we can't generate the fat
-          // binary. We don't need to generate the fat binary if we are not in
-          // device-only mode.
-          if (!isa<AssembleJobAction>(CudaDeviceActions[I]) ||
-              CompileDeviceOnly)
-            continue;
-
-          Action *AssembleAction = CudaDeviceActions[I];
-          assert(AssembleAction->getType() == types::TY_Object);
-          assert(AssembleAction->getInputs().size() == 1);
-
-          Action *BackendAction = AssembleAction->getInputs()[0];
-          assert(BackendAction->getType() == types::TY_PP_Asm);
-
-          for (auto &A : {AssembleAction, BackendAction}) {
-            OffloadAction::DeviceDependences DDep;
-            DDep.add(*A, *ToolChains.front(), CudaArchToString(GpuArchList[I]),
-                     Action::OFK_Cuda);
-            DeviceActions.push_back(
-                C.MakeAction<OffloadAction>(DDep, A->getType()));
-          }
-        }
-
-        // We generate the fat binary if we have device input actions.
-        if (!DeviceActions.empty()) {
-          CudaFatBinary =
-              C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);
-
-          if (!CompileDeviceOnly) {
-            DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
-                   Action::OFK_Cuda);
-            // Clear the fat binary, it is already a dependence to an host
-            // action.
-            CudaFatBinary = nullptr;
-          }
-
-          // Remove the CUDA actions as they are already connected to an host
-          // action or fat binary.
-          CudaDeviceActions.clear();
-        }
-
-        // We avoid creating host action in device-only mode.
-        return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success;
-      } else if (CurPhase > phases::Backend) {
-        // If we are past the backend phase and still have a device action, we
-        // don't have to do anything as this action is already a device
-        // top-level action.
-        return ABRT_Success;
-      }
-
-      assert(CurPhase < phases::Backend && "Generating single CUDA "
-                                           "instructions should only occur "
-                                           "before the backend phase!");
-
-      // By default, we produce an action for each device arch.
-      for (Action *&A : CudaDeviceActions)
-        A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A);
-
-      return ABRT_Success;
-    }
+    CudaActionBuilderBase(Compilation &C, DerivedArgList &Args,
+                          const Driver::InputList &Inputs,
+                          Action::OffloadKind OFKind)
+        : DeviceActionBuilder(C, Args, Inputs, OFKind) {}
 
     ActionBuilderReturnCode addDeviceDepences(Action *HostAction) override {
       // While generating code for CUDA, we only depend on the host input action
@@ -2317,6 +2214,17 @@ class OffloadingActionBuilder final {
         return ABRT_Success;
       }
 
+      // If this is an unbundling action use it as is for each CUDA toolchain.
+      if (auto *UA = dyn_cast<OffloadUnbundlingJobAction>(HostAction)) {
+        CudaDeviceActions.clear();
+        for (auto Arch : GpuArchList) {
+          CudaDeviceActions.push_back(UA);
+          UA->registerDependentActionInfo(ToolChains[0], CudaArchToString(Arch),
+                                          AssociatedOffloadKind);
+        }
+        return ABRT_Success;
+      }
+
       return IsActive ? ABRT_Success : ABRT_Inactive;
     }
 
@@ -2325,7 +2233,7 @@ class OffloadingActionBuilder final {
       auto AddTopLevel = [&](Action *A, CudaArch BoundArch) {
         OffloadAction::DeviceDependences Dep;
         Dep.add(*A, *ToolChains.front(), CudaArchToString(BoundArch),
-                Action::OFK_Cuda);
+                AssociatedOffloadKind);
         AL.push_back(C.MakeAction<OffloadAction>(Dep, A->getType()));
       };
 
@@ -2354,8 +2262,17 @@ class OffloadingActionBuilder final {
     }
 
     bool initialize() override {
+      assert(AssociatedOffloadKind == Action::OFK_Cuda ||
+             AssociatedOffloadKind == Action::OFK_HIP);
+
       // We don't need to support CUDA.
-      if (!C.hasOffloadToolChain<Action::OFK_Cuda>())
+      if (AssociatedOffloadKind == Action::OFK_Cuda &&
+          !C.hasOffloadToolChain<Action::OFK_Cuda>())
+        return false;
+
+      // We don't need to support HIP.
+      if (AssociatedOffloadKind == Action::OFK_HIP &&
+          !C.hasOffloadToolChain<Action::OFK_HIP>())
         return false;
 
       const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
@@ -2370,7 +2287,10 @@ class OffloadingActionBuilder final {
         return true;
       }
 
-      ToolChains.push_back(C.getSingleOffloadToolChain<Action::OFK_Cuda>());
+      ToolChains.push_back(
+          AssociatedOffloadKind == Action::OFK_Cuda
+              ? C.getSingleOffloadToolChain<Action::OFK_Cuda>()
+              : C.getSingleOffloadToolChain<Action::OFK_HIP>());
 
       Arg *PartialCompilationArg = Args.getLastArg(
           options::OPT_cuda_host_only, options::OPT_cuda_device_only,
@@ -2423,6 +2343,187 @@ class OffloadingActionBuilder final {
     }
   };
 
+  /// \brief CUDA action builder. It injects device code in the host backend
+  /// action.
+  class CudaActionBuilder final : public CudaActionBuilderBase {
+  public:
+    CudaActionBuilder(Compilation &C, DerivedArgList &Args,
+                      const Driver::InputList &Inputs)
+        : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_Cuda) {}
+
+    ActionBuilderReturnCode
+    getDeviceDependences(OffloadAction::DeviceDependences &DA,
+                         phases::ID CurPhase, phases::ID FinalPhase,
+                         PhasesTy &Phases) override {
+      if (!IsActive)
+        return ABRT_Inactive;
+
+      // If we don't have more CUDA actions, we don't have any dependences to
+      // create for the host.
+      if (CudaDeviceActions.empty())
+        return ABRT_Success;
+
+      assert(CudaDeviceActions.size() == GpuArchList.size() &&
+             "Expecting one action per GPU architecture.");
+      assert(!CompileHostOnly &&
+             "Not expecting CUDA actions in host-only compilation.");
+
+      // If we are generating code for the device or we are in a backend phase,
+      // we attempt to generate the fat binary. We compile each arch to ptx and
+      // assemble to cubin, then feed the cubin *and* the ptx into a device
+      // "link" action, which uses fatbinary to combine these cubins into one
+      // fatbin.  The fatbin is then an input to the host action if not in
+      // device-only mode.
+      if (CompileDeviceOnly || CurPhase == phases::Backend) {
+        ActionList DeviceActions;
+        for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+          // Produce the device action from the current phase up to the assemble
+          // phase.
+          for (auto Ph : Phases) {
+            // Skip the phases that were already dealt with.
+            if (Ph < CurPhase)
+              continue;
+            // We have to be consistent with the host final phase.
+            if (Ph > FinalPhase)
+              break;
+
+            CudaDeviceActions[I] = C.getDriver().ConstructPhaseAction(
+                C, Args, Ph, CudaDeviceActions[I], Action::OFK_Cuda);
+
+            if (Ph == phases::Assemble)
+              break;
+          }
+
+          // If we didn't reach the assemble phase, we can't generate the fat
+          // binary. We don't need to generate the fat binary if we are not in
+          // device-only mode.
+          if (!isa<AssembleJobAction>(CudaDeviceActions[I]) ||
+              CompileDeviceOnly)
+            continue;
+
+          Action *AssembleAction = CudaDeviceActions[I];
+          assert(AssembleAction->getType() == types::TY_Object);
+          assert(AssembleAction->getInputs().size() == 1);
+
+          Action *BackendAction = AssembleAction->getInputs()[0];
+          assert(BackendAction->getType() == types::TY_PP_Asm);
+
+          for (auto &A : {AssembleAction, BackendAction}) {
+            OffloadAction::DeviceDependences DDep;
+            DDep.add(*A, *ToolChains.front(), CudaArchToString(GpuArchList[I]),
+                     Action::OFK_Cuda);
+            DeviceActions.push_back(
+                C.MakeAction<OffloadAction>(DDep, A->getType()));
+          }
+        }
+
+        // We generate the fat binary if we have device input actions.
+        if (!DeviceActions.empty()) {
+          CudaFatBinary =
+              C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);
+
+          if (!CompileDeviceOnly) {
+            DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
+                   Action::OFK_Cuda);
+            // Clear the fat binary, it is already a dependence to an host
+            // action.
+            CudaFatBinary = nullptr;
+          }
+
+          // Remove the CUDA actions as they are already connected to an host
+          // action or fat binary.
+          CudaDeviceActions.clear();
+        }
+
+        // We avoid creating host action in device-only mode.
+        return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success;
+      } else if (CurPhase > phases::Backend) {
+        // If we are past the backend phase and still have a device action, we
+        // don't have to do anything as this action is already a device
+        // top-level action.
+        return ABRT_Success;
+      }
+
+      assert(CurPhase < phases::Backend && "Generating single CUDA "
+                                           "instructions should only occur "
+                                           "before the backend phase!");
+
+      // By default, we produce an action for each device arch.
+      for (Action *&A : CudaDeviceActions)
+        A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A);
+
+      return ABRT_Success;
+    }
+  };
+  /// \brief HIP action builder. It injects device code in the host backend
+  /// action.
+  class HIPActionBuilder final : public CudaActionBuilderBase {
+    /// The linker inputs obtained for each device arch.
+    SmallVector<ActionList, 8> DeviceLinkerInputs;
+
+  public:
+    HIPActionBuilder(Compilation &C, DerivedArgList &Args,
+                     const Driver::InputList &Inputs)
+        : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {}
+
+    bool canUseBundlerUnbundler() const override { return true; }
+
+    ActionBuilderReturnCode
+    getDeviceDependences(OffloadAction::DeviceDependences &DA,
+                         phases::ID CurPhase, phases::ID FinalPhase,
+                         PhasesTy &Phases) override {
+      // amdgcn does not support linking of object files, therefore we skip
+      // backend and assemble phases to output LLVM IR.
+      if (CudaDeviceActions.empty() || CurPhase == phases::Backend ||
+          CurPhase == phases::Assemble)
+        return ABRT_Success;
+
+      assert((CurPhase == phases::Link ||
+              CudaDeviceActions.size() == GpuArchList.size()) &&
+             "Expecting one action per GPU architecture.");
+      assert(!CompileHostOnly &&
+             "Not expecting CUDA actions in host-only compilation.");
+
+      // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
+      // This happens to each device action originated from each input file.
+      // Later on, device actions in DeviceLinkerInputs are used to create
+      // device link actions in appendLinkDependences and the created device
+      // link actions are passed to the offload action as device dependence.
+      if (CurPhase == phases::Link) {
+        DeviceLinkerInputs.resize(CudaDeviceActions.size());
+        auto LI = DeviceLinkerInputs.begin();
+        for (auto *A : CudaDeviceActions) {
+          LI->push_back(A);
+          ++LI;
+        }
+
+        // We will pass the device action as a host dependence, so we don't
+        // need to do anything else with them.
+        CudaDeviceActions.clear();
+        return ABRT_Success;
+      }
+
+      // By default, we produce an action for each device arch.
+      for (Action *&A : CudaDeviceActions)
+        A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A,
+                                               AssociatedOffloadKind);
+
+      return ABRT_Success;
+    }
+
+    void appendLinkDependences(OffloadAction::DeviceDependences &DA) override {
+      // Append a new link action for each device.
+      unsigned I = 0;
+      for (auto &LI : DeviceLinkerInputs) {
+        auto *DeviceLinkAction =
+            C.MakeAction<LinkJobAction>(LI, types::TY_Image);
+        DA.add(*DeviceLinkAction, *ToolChains[0],
+               CudaArchToString(GpuArchList[I]), AssociatedOffloadKind);
+        ++I;
+      }
+    }
+  };
+
   /// OpenMP action builder. The host bitcode is passed to the device frontend
   /// and all the device linked images are passed to the host link phase.
   class OpenMPActionBuilder final : public DeviceActionBuilder {
@@ -2589,6 +2690,9 @@ public:
     // Create a specialized builder for CUDA.
     SpecializedBuilders.push_back(new CudaActionBuilder(C, Args, Inputs));
 
+    // Create a specialized builder for HIP.
+    SpecializedBuilders.push_back(new HIPActionBuilder(C, Args, Inputs));
+
     // Create a specialized builder for OpenMP.
     SpecializedBuilders.push_back(new OpenMPActionBuilder(C, Args, Inputs));
 

Modified: cfe/trunk/test/Driver/cuda-phases.cu
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Driver/cuda-phases.cu?rev=333483&r1=333482&r2=333483&view=diff
==============================================================================
--- cfe/trunk/test/Driver/cuda-phases.cu (original)
+++ cfe/trunk/test/Driver/cuda-phases.cu Tue May 29 17:49:10 2018
@@ -7,195 +7,267 @@
 // REQUIRES: clang-driver
 // REQUIRES: powerpc-registered-target
 // REQUIRES: nvptx-registered-target
-
+// REQUIRES: amdgpu-registered-target
 //
 // Test single gpu architecture with complete compilation.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \
-// RUN: | FileCheck -check-prefix=BIN %s
-// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
-// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30)
-// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30)
-// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30)
-// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30)
-// BIN-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object
-// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler
-// BIN-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-cuda)
-// BIN-DAG: [[P11:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P10]]}, ir
-// BIN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-cuda)
-// BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-cuda)
-// BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,BIN_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,BIN_AMD %s
+// BIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
+// BIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
+// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// BIN_NV-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH:sm_30]])
+// BIN_AMD-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH:gfx803]])
+// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
+// BIN_NV-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
+// BIN_NV-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
+// BIN_NV-DAG: [[P8:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda]]:[[ARCH]])" {[[P7]]}, object
+// BIN_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH]])" {[[P6]]}, assembler
+// BIN_NV-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-[[T]])
+// BIN_NV-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-[[T]] ([[TRIPLE]])" {[[P10]]}, ir
+// BIN_NV-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// BIN_AMD-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
+// BIN_AMD-DAG: [[P15:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH]])
+// BIN_AMD-DAG: [[P16:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P14]]},
+// BIN_AMD-DAG-SAME:  "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P15]]}, object
 
 //
 // Test single gpu architecture up to the assemble phase.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \
-// RUN: | FileCheck -check-prefix=ASM %s
-// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
-// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (host-cuda)
-// ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-cuda)
-// ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=ASM,ASM_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=ASM,ASM_AMD %s
+// ASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
+// ASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// ASM_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
+// ASM_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler
+// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (host-[[T]])
+// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (host-[[T]])
+// ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-[[T]])
+// ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-[[T]])
 
 //
 // Test two gpu architectures with complete compilation.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
-// RUN: | FileCheck -check-prefix=BIN2 %s
-// BIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
-// BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30)
-// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30)
-// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30)
-// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30)
-// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object
-// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler
-// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (device-cuda, sm_35)
-// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, sm_35)
-// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-cuda, sm_35)
-// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, sm_35)
-// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P14]]}, object
-// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P13]]}, assembler
-// BIN2-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-cuda)
-// BIN2-DAG: [[P18:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P17]]}, ir
-// BIN2-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-cuda)
-// BIN2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-cuda)
-// BIN2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN2,BIN2_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN2,BIN2_AMD %s
+// BIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
+// BIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
+// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH1:sm_30|gfx803]])
+// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH1]])
+// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH1]])
+// BIN2_NV-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH1]])
+// BIN2_NV-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH1]])
+// BIN2_NV-DAG: [[P8:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda]]:[[ARCH1]])" {[[P7]]}, object
+// BIN2_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH1]])" {[[P6]]}, assembler
+// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]])
+// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-[[T]], [[ARCH2]])
+// BIN2_NV-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-[[T]], [[ARCH2]])
+// BIN2_NV-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-[[T]], [[ARCH2]])
+// BIN2_NV-DAG: [[P15:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P14]]}, object
+// BIN2_NV-DAG: [[P16:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P13]]}, assembler
+// BIN2_NV-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-[[T]])
+// BIN2_NV-DAG: [[P18:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-[[T]] ([[TRIPLE]])" {[[P17]]}, ir
+// BIN2_NV-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-[[T]])
+// BIN2_AMD-DAG: [[P19:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// BIN2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-[[T]])
+// BIN2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-[[T]])
+// BIN2_AMD-DAG: [[P22:[0-9]+]]: linker, {[[P5]]}, image, (device-[[T]], [[ARCH1]])
+// BIN2_AMD-DAG: [[P23:[0-9]+]]: linker, {[[P12]]}, image, (device-[[T]], [[ARCH2]])
+// BIN2_AMD-DAG: [[P24:[0-9]+]]: offload, "host-[[T]] (powerpc64le-ibm-linux-gnu)" {[[P21]]},
+// BIN2_AMD-DAG-SAME:  "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH1]])" {[[P22]]},
+// BIN2_AMD-DAG-SAME:  "device-[[T]] ([[TRIPLE:amdgcn-amd-amdhsa]]:[[ARCH2]])" {[[P23]]}, object
 
 //
 // Test two gpu architecturess up to the assemble phase.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
-// RUN: | FileCheck -check-prefix=ASM2 %s
-// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
-// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35)
-// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35)
-// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35)
-// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler
-// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (host-cuda)
-// ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-cuda)
-// ASM2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=ASM2,ASM2_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=ASM2,ASM2_AMD %s
+// ASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH1:sm_30]])
+// ASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH1:gfx803]])
+// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH1]])
+// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH1]])
+// ASM2_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH1]])
+// ASM2_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH1]])" {[[P3]]}, assembler
+// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]])
+// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]])
+// ASM2_NV-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]])
+// ASM2_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler
+// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (host-[[T]])
+// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (host-[[T]])
+// ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-[[T]])
+// ASM2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-[[T]])
 
 //
 // Test single gpu architecture with complete compilation in host-only
 // compilation mode.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \
-// RUN: | FileCheck -check-prefix=HBIN %s
-// HBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
-// HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
-// HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda)
-// HBIN-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=HBIN,HBIN_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=HBIN,HBIN_AMD %s
+// HBIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
+// HBIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
+// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-[[T]])
+// HBIN-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-[[T]])
 //
 // Test single gpu architecture up to the assemble phase in host-only
 // compilation mode.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \
-// RUN: | FileCheck -check-prefix=HASM %s
-// HASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
-// HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=HASM,HASM_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 %s --cuda-host-only -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=HASM,HASM_AMD %s
+// HASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
+// HASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
+// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
 
 //
 // Test two gpu architectures with complete compilation in host-only
 // compilation mode.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \
-// RUN: | FileCheck -check-prefix=HBIN2 %s
-// HBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
-// HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
-// HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda)
-// HBIN2-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=HBIN2,HBIN2_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=HBIN2,HBIN2_AMD %s
+// HBIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
+// HBIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
+// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-[[T]])
+// HBIN2-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-[[T]])
 
 //
 // Test two gpu architectures up to the assemble phase in host-only
 // compilation mode.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \
-// RUN: | FileCheck -check-prefix=HASM2 %s
-// HASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda)
-// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda)
-// HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda)
-// HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda)
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S \
+// RUN: 2>&1 | FileCheck -check-prefixes=HASM2,HASM2_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-host-only -S \
+// RUN: 2>&1 | FileCheck -check-prefixes=HASM2,HASM2_AMD %s
+// HASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (host-[[T]])
+// HASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (host-[[T]])
+// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
 
 //
 // Test single gpu architecture with complete compilation in device-only
 // compilation mode.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \
-// RUN: | FileCheck -check-prefix=DBIN %s
-// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30)
-// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=DBIN,DBIN_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=DBIN,DBIN_AMD %s
+// DBIN_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
+// DBIN_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// DBIN_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
+// DBIN_NV-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-[[T]], [[ARCH]])
+// DBIN_NV-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] (nvptx64-nvidia-cuda:[[ARCH]])" {[[P4]]}, object
 
 //
 // Test single gpu architecture up to the assemble phase in device-only
 // compilation mode.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \
-// RUN: | FileCheck -check-prefix=DASM %s
-// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=DASM,DASM_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=DASM,DASM_AMD %s
+// DASM_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
+// DASM_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// DASM_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
+// DASM_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler
 
 //
 // Test two gpu architectures with complete compilation in device-only
 // compilation mode.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \
-// RUN: | FileCheck -check-prefix=DBIN2 %s
-// DBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30)
-// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object
-// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, cuda-cpp-output, (device-cuda, sm_35)
-// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, sm_35)
-// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-cuda, sm_35)
-// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, sm_35)
-// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P10]]}, object
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefixes=DBIN2,DBIN2_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --cuda-device-only \
+// RUN: 2>&1 | FileCheck -check-prefixes=DBIN2,DBIN2_AMD %s
+// DBIN2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
+// DBIN2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// DBIN2_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
+// DBIN2_NV-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-[[T]], [[ARCH]])
+// DBIN2_NV-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda]]:[[ARCH]])" {[[P4]]}, object
+// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]])
+// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-[[T]], [[ARCH2]])
+// DBIN2_NV-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-[[T]], [[ARCH2]])
+// DBIN2_NV-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-[[T]], [[ARCH2]])
+// DBIN2_NV-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P10]]}, object
 
 //
 // Test two gpu architectures up to the assemble phase in device-only
 // compilation mode.
 //
-// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \
-// RUN: | FileCheck -check-prefix=DASM2 %s
-// DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30)
-// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30)
-// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30)
-// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30)
-// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler
-// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35)
-// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35)
-// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35)
-// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35)
-// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases \
+// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S \
+// RUN: 2>&1 | FileCheck -check-prefixes=DASM2,DASM2_NV %s
+// RUN: %clang -x hip -target powerpc64le-ibm-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefixes=DASM2,DASM2_AMD %s
+// DASM2_NV-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:cuda]], (device-[[T]], [[ARCH:sm_30]])
+// DASM2_AMD-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// DASM2_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
+// DASM2_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler
+// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", [[T]], (device-[[T]], [[ARCH2:sm_35|gfx900]])
+// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]])
+// DASM2_NV-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]])
+// DASM2_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler