[clang] 77df5a8 - [HIP] Move HIP Linking Logic into HIP ToolChain

Aaron En Ye Shi via cfe-commits cfe-commits at lists.llvm.org
Mon Jun 22 12:49:29 PDT 2020


Author: Aaron En Ye Shi
Date: 2020-06-22T19:48:48Z
New Revision: 77df5a8283edbfc33ad3b12df3bd42d54d7ba4f4

URL: https://github.com/llvm/llvm-project/commit/77df5a8283edbfc33ad3b12df3bd42d54d7ba4f4
DIFF: https://github.com/llvm/llvm-project/commit/77df5a8283edbfc33ad3b12df3bd42d54d7ba4f4.diff

LOG: [HIP] Move HIP Linking Logic into HIP ToolChain

This patch is a follow up on https://reviews.llvm.org/D78759.

Extract the HIP Linker script from generic GNU linker,
and move it into HIP ToolChain. Update OffloadActionBuilder
Link actions feature to apply device linking and host linking
actions separately. Using MC Directives, embed the device images
and define symbols.

Reviewers: JonChesterfield, yaxunl

Subscribers: tra, echristo, jdoerfert, msearles, scchan

Differential Revision: https://reviews.llvm.org/D81963

Added: 
    

Modified: 
    clang/lib/Driver/Driver.cpp
    clang/lib/Driver/ToolChains/CommonArgs.cpp
    clang/lib/Driver/ToolChains/CommonArgs.h
    clang/lib/Driver/ToolChains/Gnu.cpp
    clang/lib/Driver/ToolChains/HIP.cpp
    clang/lib/Driver/ToolChains/HIP.h
    clang/test/Driver/hip-binding.hip
    clang/test/Driver/hip-link-save-temps.hip
    clang/test/Driver/hip-link-shared-library.hip
    clang/test/Driver/hip-phases.hip
    clang/test/Driver/hip-save-temps.hip
    clang/test/Driver/hip-toolchain-rdc-separate.hip
    clang/test/Driver/hip-toolchain-rdc.hip

Removed: 
    


################################################################################
diff  --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index a48761af400f..de732918fc76 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -2325,8 +2325,11 @@ class OffloadingActionBuilder final {
     /// Append top level actions generated by the builder.
     virtual void appendTopLevelActions(ActionList &AL) {}
 
-    /// Append linker actions generated by the builder.
-    virtual void appendLinkActions(ActionList &AL) {}
+    /// Append linker device actions generated by the builder.
+    virtual void appendLinkDeviceActions(ActionList &AL) {}
+
+    /// Append linker host action generated by the builder.
+    virtual Action* appendLinkHostActions(ActionList &AL) { return nullptr; }
 
     /// Append linker actions generated by the builder.
     virtual void appendLinkDependences(OffloadAction::DeviceDependences &DA) {}
@@ -2796,17 +2799,45 @@ class OffloadingActionBuilder final {
                                                            : ABRT_Success;
     }
 
-    void appendLinkDependences(OffloadAction::DeviceDependences &DA) override {
+    void appendLinkDeviceActions(ActionList &AL) override {
+      if (DeviceLinkerInputs.size() == 0)
+        return;
+
+      assert(DeviceLinkerInputs.size() == GpuArchList.size() &&
+             "Linker inputs and GPU arch list sizes do not match.");
+
       // Append a new link action for each device.
       unsigned I = 0;
       for (auto &LI : DeviceLinkerInputs) {
+        // Each entry in DeviceLinkerInputs corresponds to a GPU arch.
         auto *DeviceLinkAction =
             C.MakeAction<LinkJobAction>(LI, types::TY_Image);
-        DA.add(*DeviceLinkAction, *ToolChains[0],
-               CudaArchToString(GpuArchList[I]), AssociatedOffloadKind);
+        // Linking all inputs for the current GPU arch.
+        // LI contains all the inputs for the linker.
+        OffloadAction::DeviceDependences DeviceLinkDeps;
+        DeviceLinkDeps.add(*DeviceLinkAction, *ToolChains[0],
+            CudaArchToString(GpuArchList[I]), AssociatedOffloadKind);
+        AL.push_back(C.MakeAction<OffloadAction>(DeviceLinkDeps,
+            DeviceLinkAction->getType()));
         ++I;
       }
+      DeviceLinkerInputs.clear();
+
+      // Create a host object from all the device images by embedding them
+      // in a fat binary.
+      OffloadAction::DeviceDependences DDeps;
+      auto *TopDeviceLinkAction =
+          C.MakeAction<LinkJobAction>(AL, types::TY_Object);
+      DDeps.add(*TopDeviceLinkAction, *ToolChains[0],
+          nullptr, AssociatedOffloadKind);
+
+      // Offload the host object to the host linker.
+      AL.push_back(C.MakeAction<OffloadAction>(DDeps, TopDeviceLinkAction->getType()));
     }
+
+    Action* appendLinkHostActions(ActionList &AL) override { return AL.back(); }
+
+    void appendLinkDependences(OffloadAction::DeviceDependences &DA) override {}
   };
 
   /// OpenMP action builder. The host bitcode is passed to the device frontend
@@ -2934,7 +2965,7 @@ class OffloadingActionBuilder final {
       OpenMPDeviceActions.clear();
     }
 
-    void appendLinkActions(ActionList &AL) override {
+    void appendLinkDeviceActions(ActionList &AL) override {
       assert(ToolChains.size() == DeviceLinkerInputs.size() &&
              "Toolchains and linker inputs sizes do not match.");
 
@@ -2953,6 +2984,14 @@ class OffloadingActionBuilder final {
       DeviceLinkerInputs.clear();
     }
 
+    Action* appendLinkHostActions(ActionList &AL) override {
+      // Create wrapper bitcode from the result of device link actions and compile
+      // it to an object which will be added to the host link command.
+      auto *BC = C.MakeAction<OffloadWrapperJobAction>(AL, types::TY_LLVM_BC);
+      auto *ASM = C.MakeAction<BackendJobAction>(BC, types::TY_PP_Asm);
+      return C.MakeAction<AssembleJobAction>(ASM, types::TY_Object);
+    }
+
     void appendLinkDependences(OffloadAction::DeviceDependences &DA) override {}
 
     bool initialize() override {
@@ -3185,17 +3224,20 @@ class OffloadingActionBuilder final {
     for (DeviceActionBuilder *SB : SpecializedBuilders) {
       if (!SB->isValid())
         continue;
-      SB->appendLinkActions(DeviceAL);
+      SB->appendLinkDeviceActions(DeviceAL);
     }
 
     if (DeviceAL.empty())
       return nullptr;
 
-    // Create wrapper bitcode from the result of device link actions and compile
-    // it to an object which will be added to the host link command.
-    auto *BC = C.MakeAction<OffloadWrapperJobAction>(DeviceAL, types::TY_LLVM_BC);
-    auto *ASM = C.MakeAction<BackendJobAction>(BC, types::TY_PP_Asm);
-    return C.MakeAction<AssembleJobAction>(ASM, types::TY_Object);
+    // Let builders add host linking actions.
+    Action* HA;
+    for (DeviceActionBuilder *SB : SpecializedBuilders) {
+      if (!SB->isValid())
+        continue;
+      HA = SB->appendLinkHostActions(DeviceAL);
+    }
+    return HA;
   }
 
   /// Processes the host linker action. This currently consists of replacing it

diff  --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 7fdb3fdf0009..542fb67db7ef 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -152,14 +152,12 @@ void tools::AddLinkerInputs(const ToolChain &TC, const InputInfoList &Inputs,
     addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH");
 
   for (const auto &II : Inputs) {
-    // If the current tool chain refers to an OpenMP or HIP offloading host, we
-    // should ignore inputs that refer to OpenMP or HIP offloading devices -
+    // If the current tool chain refers to an OpenMP offloading host, we
+    // should ignore inputs that refer to OpenMP offloading devices -
     // they will be embedded according to a proper linker script.
     if (auto *IA = II.getAction())
       if ((JA.isHostOffloading(Action::OFK_OpenMP) &&
-           IA->isDeviceOffloading(Action::OFK_OpenMP)) ||
-          (JA.isHostOffloading(Action::OFK_HIP) &&
-           IA->isDeviceOffloading(Action::OFK_HIP)))
+           IA->isDeviceOffloading(Action::OFK_OpenMP)))
         continue;
 
     if (!TC.HasNativeLLVMSupport() && types::isLLVMIR(II.getType()))
@@ -1298,115 +1296,6 @@ void tools::AddRunTimeLibs(const ToolChain &TC, const Driver &D,
   }
 }
 
-/// Add HIP linker script arguments at the end of the argument list so that
-/// the fat binary is built by embedding the device images into the host. The
-/// linker script also defines a symbol required by the code generation so that
-/// the image can be retrieved at runtime. This should be used only in tool
-/// chains that support linker scripts.
-void tools::AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
-                               const InputInfo &Output,
-                               const InputInfoList &Inputs, const ArgList &Args,
-                               ArgStringList &CmdArgs, const JobAction &JA,
-                               const Tool &T) {
-
-  // If this is not a HIP host toolchain, we don't need to do anything.
-  if (!JA.isHostOffloading(Action::OFK_HIP))
-    return;
-
-  InputInfoList DeviceInputs;
-  for (const auto &II : Inputs) {
-    const Action *A = II.getAction();
-    // Is this a device linking action?
-    if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
-      DeviceInputs.push_back(II);
-    }
-  }
-
-  if (DeviceInputs.empty())
-    return;
-
-  // Create temporary linker script. Keep it if save-temps is enabled.
-  const char *LKS;
-  std::string Name =
-      std::string(llvm::sys::path::filename(Output.getFilename()));
-  if (C.getDriver().isSaveTempsEnabled()) {
-    LKS = C.getArgs().MakeArgString(Name + ".lk");
-  } else {
-    auto TmpName = C.getDriver().GetTemporaryPath(Name, "lk");
-    LKS = C.addTempFile(C.getArgs().MakeArgString(TmpName));
-  }
-
-  // Add linker script option to the command.
-  CmdArgs.push_back("-T");
-  CmdArgs.push_back(LKS);
-
-  // Create a buffer to write the contents of the linker script.
-  std::string LksBuffer;
-  llvm::raw_string_ostream LksStream(LksBuffer);
-
-  // Get the HIP offload tool chain.
-  auto *HIPTC = static_cast<const toolchains::HIPToolChain *>(
-      C.getSingleOffloadToolChain<Action::OFK_HIP>());
-  assert(HIPTC->getTriple().getArch() == llvm::Triple::amdgcn &&
-         "Wrong platform");
-  (void)HIPTC;
-
-  const char *BundleFile;
-  if (C.getDriver().isSaveTempsEnabled()) {
-    BundleFile = C.getArgs().MakeArgString(Name + ".hipfb");
-  } else {
-    auto TmpName = C.getDriver().GetTemporaryPath(Name, "hipfb");
-    BundleFile = C.addTempFile(C.getArgs().MakeArgString(TmpName));
-  }
-  AMDGCN::constructHIPFatbinCommand(C, JA, BundleFile, DeviceInputs, Args, T);
-
-  // Add commands to embed target binaries. We ensure that each section and
-  // image is 16-byte aligned. This is not mandatory, but increases the
-  // likelihood of data to be aligned with a cache block in several main host
-  // machines.
-  LksStream << "/*\n";
-  LksStream << "       HIP Offload Linker Script\n";
-  LksStream << " *** Automatically generated by Clang ***\n";
-  LksStream << "*/\n";
-  LksStream << "TARGET(binary)\n";
-  LksStream << "INPUT(" << BundleFile << ")\n";
-  LksStream << "SECTIONS\n";
-  LksStream << "{\n";
-  LksStream << "  .hip_fatbin :\n";
-  LksStream << "  ALIGN(0x10)\n";
-  LksStream << "  {\n";
-  LksStream << "    PROVIDE_HIDDEN(__hip_fatbin = .);\n";
-  LksStream << "    " << BundleFile << "\n";
-  LksStream << "  }\n";
-  LksStream << "  /DISCARD/ :\n";
-  LksStream << "  {\n";
-  LksStream << "    * ( __CLANG_OFFLOAD_BUNDLE__* )\n";
-  LksStream << "  }\n";
-  LksStream << "}\n";
-  LksStream << "INSERT BEFORE .data\n";
-  LksStream.flush();
-
-  // Dump the contents of the linker script if the user requested that. We
-  // support this option to enable testing of behavior with -###.
-  if (C.getArgs().hasArg(options::OPT_fhip_dump_offload_linker_script))
-    llvm::errs() << LksBuffer;
-
-  // If this is a dry run, do not create the linker script file.
-  if (C.getArgs().hasArg(options::OPT__HASH_HASH_HASH))
-    return;
-
-  // Open script file and write the contents.
-  std::error_code EC;
-  llvm::raw_fd_ostream Lksf(LKS, EC, llvm::sys::fs::OF_None);
-
-  if (EC) {
-    C.getDriver().Diag(clang::diag::err_unable_to_make_temp) << EC.message();
-    return;
-  }
-
-  Lksf << LksBuffer;
-}
-
 SmallString<128> tools::getStatsFileName(const llvm::opt::ArgList &Args,
                                          const InputInfo &Output,
                                          const InputInfo &Input,

diff  --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h
index 58bc92c9b756..c7d695ebf415 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.h
+++ b/clang/lib/Driver/ToolChains/CommonArgs.h
@@ -45,12 +45,6 @@ void AddRunTimeLibs(const ToolChain &TC, const Driver &D,
                     llvm::opt::ArgStringList &CmdArgs,
                     const llvm::opt::ArgList &Args);
 
-void AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
-                        const InputInfo &Output, const InputInfoList &Inputs,
-                        const llvm::opt::ArgList &Args,
-                        llvm::opt::ArgStringList &CmdArgs, const JobAction &JA,
-                        const Tool &T);
-
 const char *SplitDebugName(const llvm::opt::ArgList &Args,
                            const InputInfo &Input, const InputInfo &Output);
 

diff  --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 93a285d42a08..619407a7c57b 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -625,10 +625,6 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     }
   }
 
-  // Add HIP offloading linker script args if required.
-  AddHIPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA,
-                     *this);
-
   Args.AddAllArgs(CmdArgs, options::OPT_T);
 
   const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath());

diff  --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp
index b9143f98f152..f524951ce837 100644
--- a/clang/lib/Driver/ToolChains/HIP.cpp
+++ b/clang/lib/Driver/ToolChains/HIP.cpp
@@ -104,6 +104,76 @@ void AMDGCN::constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
   C.addCommand(std::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));
 }
 
+/// Add Generated HIP Object File which has device images embedded into the
+/// host to the argument list for linking. Using MC directives, embed the
+/// device code and also define symbols required by the code generation so that
+/// the image can be retrieved at runtime.
+void AMDGCN::Linker::constructGenerateObjFileFromHIPFatBinary(
+    Compilation &C, const InputInfo &Output,
+    const InputInfoList &Inputs, const ArgList &Args,
+    const JobAction &JA) const {
+  const ToolChain &TC = getToolChain();
+  std::string Name =
+      std::string(llvm::sys::path::stem(Output.getFilename()));
+
+  // Create Temp Object File Generator,
+  // Offload Bundled file and Bundled Object file.
+  // Keep them if save-temps is enabled.
+  const char *McinFile;
+  const char *BundleFile;
+  if (C.getDriver().isSaveTempsEnabled()) {
+    McinFile = C.getArgs().MakeArgString(Name + ".mcin");
+    BundleFile = C.getArgs().MakeArgString(Name + ".hipfb");
+  } else {
+    auto TmpNameMcin = C.getDriver().GetTemporaryPath(Name, "mcin");
+    McinFile = C.addTempFile(C.getArgs().MakeArgString(TmpNameMcin));
+    auto TmpNameFb = C.getDriver().GetTemporaryPath(Name, "hipfb");
+    BundleFile = C.addTempFile(C.getArgs().MakeArgString(TmpNameFb));
+  }
+  constructHIPFatbinCommand(C, JA, BundleFile, Inputs, Args, *this);
+
+  // Create a buffer to write the contents of the temp obj generator.
+  std::string ObjBuffer;
+  llvm::raw_string_ostream ObjStream(ObjBuffer);
+
+  // Add MC directives to embed target binaries. We ensure that each
+  // section and image is 16-byte aligned. This is not mandatory, but
+  // increases the likelihood of data to be aligned with a cache block
+  // in several main host machines.
+  ObjStream << "#       HIP Object Generator\n";
+  ObjStream << "# *** Automatically generated by Clang ***\n";
+  ObjStream << "  .type __hip_fatbin, at object\n";
+  ObjStream << "  .section .hip_fatbin,\"aMS\", at progbits,1\n";
+  ObjStream << "  .data\n";
+  ObjStream << "  .globl __hip_fatbin\n";
+  ObjStream << "  .p2align 3\n";
+  ObjStream << "__hip_fatbin:\n";
+  ObjStream << "  .incbin \"" << BundleFile << "\"\n";
+  ObjStream.flush();
+
+  // Dump the contents of the temp object file gen if the user requested that.
+  // We support this option to enable testing of behavior with -###.
+  if (C.getArgs().hasArg(options::OPT_fhip_dump_offload_linker_script))
+    llvm::errs() << ObjBuffer;
+
+  // Open script file and write the contents.
+  std::error_code EC;
+  llvm::raw_fd_ostream Objf(McinFile, EC, llvm::sys::fs::OF_None);
+
+  if (EC) {
+    C.getDriver().Diag(clang::diag::err_unable_to_make_temp) << EC.message();
+    return;
+  }
+
+  Objf << ObjBuffer;
+
+  ArgStringList McArgs{"-triple", Args.MakeArgString(TC.getTripleString()),
+                       "-o",      Output.getFilename(),
+                       McinFile,  "--filetype=obj"};
+  const char *Mc = Args.MakeArgString(TC.GetProgramPath("llvm-mc"));
+  C.addCommand(std::make_unique<Command>(JA, *this, Mc, McArgs, Inputs));
+}
+
 // For amdgcn the inputs of the linker job are device bitcode and output is
 // object file. It calls llvm-link, opt, llc, then lld steps.
 void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA,
@@ -111,6 +181,10 @@ void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA,
                                    const InputInfoList &Inputs,
                                    const ArgList &Args,
                                    const char *LinkingOutput) const {
+  if (Inputs.size() > 0 &&
+      Inputs[0].getType() == types::TY_Image &&
+      JA.getType() == types::TY_Object)
+    return constructGenerateObjFileFromHIPFatBinary(C, Output, Inputs, Args, JA);
 
   if (JA.getType() == types::TY_HIP_FATBIN)
     return constructHIPFatbinCommand(C, JA, Output.getFilename(), Inputs, Args, *this);

diff  --git a/clang/lib/Driver/ToolChains/HIP.h b/clang/lib/Driver/ToolChains/HIP.h
index fa6984ed9f52..5e2be7138579 100644
--- a/clang/lib/Driver/ToolChains/HIP.h
+++ b/clang/lib/Driver/ToolChains/HIP.h
@@ -42,6 +42,13 @@ class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
   void constructLldCommand(Compilation &C, const JobAction &JA,
                            const InputInfoList &Inputs, const InputInfo &Output,
                            const llvm::opt::ArgList &Args) const;
+
+  // Construct command for creating Object from HIP fatbin.
+  void constructGenerateObjFileFromHIPFatBinary(Compilation &C,
+                                                const InputInfo &Output,
+                                                const InputInfoList &Inputs,
+                                                const llvm::opt::ArgList &Args,
+                                                const JobAction &JA) const;
 };
 
 } // end namespace AMDGCN

diff  --git a/clang/test/Driver/hip-binding.hip b/clang/test/Driver/hip-binding.hip
index 53fe9af0e653..3d8393022346 100644
--- a/clang/test/Driver/hip-binding.hip
+++ b/clang/test/Driver/hip-binding.hip
@@ -25,12 +25,15 @@
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -fgpu-rdc %t.o\
 // RUN: 2>&1 | FileCheck %s
 
-// CHECK: # "amdgcn-amd-amdhsa" - "offload bundler", inputs: ["[[IN:.*o]]"], outputs: ["[[OBJ1:.*o]]", "[[OBJ2:.*o]]", "[[OBJ3:.*o]]"] 
-// CHECK: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OBJ2]]"], output: "[[IMG2:.*out]]"
+// CHECK: # "x86_64-unknown-linux-gnu" - "offload bundler", inputs: ["[[IN:.*o]]"], outputs: ["[[HOSTOBJ:.*o]]", "{{.*o}}", "{{.*o}}"]
+// CHECK: # "amdgcn-amd-amdhsa" - "offload bundler", inputs: ["[[IN]]"], outputs: ["{{.*o}}", "[[DOBJ1:.*o]]", "[[DOBJ2:.*o]]"]
+// CHECK: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[DOBJ1]]"], output: "[[IMG1:.*out]]"
 // CHECK-NOT: offload bundler
-// CHECK: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OBJ3]]"], output: "[[IMG3:.*out]]"
+// CHECK: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[DOBJ2]]"], output: "[[IMG2:.*out]]"
 // CHECK-NOT: offload bundler
-// CHECK: # "x86_64-unknown-linux-gnu" - "GNU::Linker", inputs: ["[[OBJ1]]", "[[IMG2]]", "[[IMG3]]"], output: "a.out"
+// CHECK: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[IMG1]]", "[[IMG2]]"], output: "[[FATBINOBJ:.*o]]"
+// CHECK-NOT: offload bundler
+// CHECK: # "x86_64-unknown-linux-gnu" - "GNU::Linker", inputs: ["[[HOSTOBJ]]", "[[FATBINOBJ]]"], output: "a.out"
 
 // RUN: %clang --hip-link -ccc-print-bindings -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %t.o\

diff  --git a/clang/test/Driver/hip-link-save-temps.hip b/clang/test/Driver/hip-link-save-temps.hip
index 304be921ebc6..62d8d60af3c0 100644
--- a/clang/test/Driver/hip-link-save-temps.hip
+++ b/clang/test/Driver/hip-link-save-temps.hip
@@ -27,7 +27,7 @@
 // CHECK-SAME: "-o" "a.out-hip-amdgcn-amd-amdhsa-gfx900" "obj1-hip-amdgcn-amd-amdhsa-gfx900.o" "obj2-hip-amdgcn-amd-amdhsa-gfx900.o"
 // CHECK: "{{.*lld.*}}" {{.*}} "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-o" "a.out-hip-amdgcn-amd-amdhsa-gfx906" "obj1-hip-amdgcn-amd-amdhsa-gfx906.o" "obj2-hip-amdgcn-amd-amdhsa-gfx906.o"
-// OUT: "{{.*clang-offload-bundler.*}}" {{.*}} "-outputs=executable.hipfb"
-// OUT: "{{.*ld.*}}" {{.*}} "-o" "executable" {{.*}} "-T" "executable.lk"
-// NOUT: "{{.*clang-offload-bundler.*}}" {{.*}} "-outputs=a.out.hipfb"
-// NOUT: "{{.*ld.*}}" {{.*}} "-o" "a.out" {{.*}} "-T" "a.out.lk"
+// CHECK: {{".*llvm-mc.*"}} "-triple" "amdgcn-amd-amdhsa" "-o"
+// CHECK-SAME: "[[OBJBUNDLE:.*.o]]" "{{.*}}.mcin" "--filetype=obj"
+// OUT: "{{.*ld.*}}" {{.*}} "-o" "executable" {{.*}} "[[OBJBUNDLE]]"
+// NOUT: "{{.*ld.*}}" {{.*}} "-o" "a.out" {{.*}} "[[OBJBUNDLE]]"

diff  --git a/clang/test/Driver/hip-link-shared-library.hip b/clang/test/Driver/hip-link-shared-library.hip
index cb409d1a874d..895bd53225e1 100644
--- a/clang/test/Driver/hip-link-shared-library.hip
+++ b/clang/test/Driver/hip-link-shared-library.hip
@@ -3,10 +3,13 @@
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %t.o %S/Inputs/in.so \
 // RUN:   -fgpu-rdc 2>&1 | FileCheck %s
 
-// CHECK: # "amdgcn-amd-amdhsa" - "offload bundler", inputs: ["[[IN:.*o]]"], outputs: ["[[OBJ1:.*o]]", "[[OBJ2:.*o]]", "[[OBJ3:.*o]]"]
-// CHECK: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OBJ2]]"], output: "[[IMG2:.*out]]"
+// CHECK: # "x86_64-unknown-linux-gnu" - "offload bundler", inputs: ["[[IN:.*o]]"], outputs: ["[[HOSTOBJ:.*o]]", "{{.*o}}", "{{.*o}}"]
+// CHECK: # "amdgcn-amd-amdhsa" - "offload bundler", inputs: ["[[IN]]"], outputs: ["{{.*o}}", "[[DOBJ1:.*o]]", "[[DOBJ2:.*o]]"]
+// CHECK: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[DOBJ1]]"], output: "[[IMG1:.*out]]"
 // CHECK-NOT: offload bundler
-// CHECK: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OBJ3]]"], output: "[[IMG3:.*out]]"
+// CHECK: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[DOBJ2]]"], output: "[[IMG2:.*out]]"
 // CHECK-NOT: offload bundler
-// CHECK: # "x86_64-unknown-linux-gnu" - "GNU::Linker", inputs: ["[[OBJ1]]", "{{.*}}/Inputs/in.so", "[[IMG2]]", "[[IMG3]]"], output: "a.out"
+// CHECK: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[IMG1]]", "[[IMG2]]"], output: "[[FATBINOBJ:.*o]]"
+// CHECK-NOT: offload bundler
+// CHECK: # "x86_64-unknown-linux-gnu" - "GNU::Linker", inputs: ["[[HOSTOBJ]]", "{{.*}}/Inputs/in.so", "[[FATBINOBJ]]"], output: "a.out"
 

diff  --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip
index b27ae44167a2..7c2dc1384ccd 100644
--- a/clang/test/Driver/hip-phases.hip
+++ b/clang/test/Driver/hip-phases.hip
@@ -21,25 +21,27 @@
 // BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]])
 // BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
 // BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// RDC-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
 
 // BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH:gfx803]])
 // BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
 // BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
 // NRD-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
 // NRD-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
-// NRD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
-// NRD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
+// RDC-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
+// BIN-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
+// BIN-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
 // NRD-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
-// RDC-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
-// RDC-DAG: [[P10:[0-9]+]]: linker, {[[P6]]}, image, (device-[[T]], [[ARCH]])
+// RDC-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
 
-// NRD-DAG: [[P12:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
+// NRD-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
+// RDC-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
+// NRD-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// NRD-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// NRD-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
+// RDC-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
 
-// NRD-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-[[T]])
-// RDC-DAG: [[P13:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
-// BIN-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (host-[[T]])
-// BIN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image, (host-[[T]])
-// RDC-DAG: [[P16:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P15]]}, "device-[[T]] (amdgcn-amd-amdhsa:gfx803)" {[[P10]]}, image
 //
 // Test single gpu architecture up to the assemble phase.
 //
@@ -56,59 +58,84 @@
 // ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-[[T]])
 
 //
-// Test two gpu architectures with complete compilation.
+// Test two gpu architectures with complete compilation with -fno-gpu-rdc.
 //
 // RUN: %clang -x hip -target x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN2,NRD2,CL2 %s
+// RUN: | FileCheck -check-prefixes=NRD2,NCL2 %s
 
 // RUN: %clang -x hip -target x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -c 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN2,NRD2 %s
+// RUN: | FileCheck -check-prefixes=NRD2 %s
 
+// NRD2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]])
+// NRD2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// NRD2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+
+// NRD2-DAG: [[P3:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH1:gfx803]])
+// NRD2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH1]])
+// NRD2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH1]])
+// NRD2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH1]])
+// NRD2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH1]])
+// NRD2-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH1]])
+// NRD2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH1]])" {[[P8]]}, image
+
+// NRD2-DAG: [[P10:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH2:gfx900]])
+// NRD2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// NRD2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-[[T]], [[ARCH2]])
+// NRD2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-[[T]], [[ARCH2]])
+// NRD2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-[[T]], [[ARCH2]])
+// NRD2-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image, (device-[[T]], [[ARCH2]])
+// NRD2-DAG: [[P16:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P15]]}, image
+// NRD2-DAG: [[P17:[0-9]+]]: linker, {[[P9]], [[P16]]}, hip-fatbin, (device-[[T]])
+// NRD2-DAG: [[P18:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P17]]}, ir
+// NRD2-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-[[T]])
+// NRD2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-[[T]])
+// NCL2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-[[T]])
+
+//
+// Test two gpu architectures with complete compilation with -fgpu-rdc.
+//
 // RUN: %clang -x hip -target x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -fgpu-rdc 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN2,RDC2,CL2,RCL2 %s
+// RUN: | FileCheck -check-prefixes=RDC2,CL2,RCL2 %s
 
 // RUN: %clang -x hip -target x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s -fgpu-rdc -c 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN2,RDC2,RC2 %s
+// RUN: | FileCheck -check-prefixes=RDC2,RC2 %s
 
-// BIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]])
-// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
-// BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// RCL2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]])
+// RCL2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// RCL2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// RCL2-DAG: [[P19:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// RCL2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-[[T]])
 
-// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH1:gfx803]])
-// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH1]])
-// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH1]])
-// NRD2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH1]])
+// RDC2-DAG: [[P3:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH1:gfx803]])
+// RDC2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH1]])
+// RDC2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH1]])
 // RDC2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH1]])
-// NRD2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH1]])
-// NRD2-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH1]])
 // RCL2-DAG: [[P8:[0-9]+]]: linker, {[[P6]]}, image, (device-[[T]], [[ARCH1]])
-// NRD2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH1]])" {[[P8]]}, image
+// RCL2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH1]])" {[[P8]]}, image
 // RC2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH1]])" {[[P6]]}, ir
 
-// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH2:gfx900]])
-// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
-// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-[[T]], [[ARCH2]])
-// NRD2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-[[T]], [[ARCH2]])
+// RDC2-DAG: [[P10:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH2:gfx900]])
+// RDC2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// RDC2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-[[T]], [[ARCH2]])
 // RDC2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, ir, (device-[[T]], [[ARCH2]])
-// NRD2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-[[T]], [[ARCH2]])
-// NRD2-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image, (device-[[T]], [[ARCH2]])
 // RCL2-DAG: [[P15:[0-9]+]]: linker, {[[P13]]}, image, (device-[[T]], [[ARCH2]])
-// NRD2-DAG: [[P16:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P15]]}, image
+// RCL2-DAG: [[P16:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P15]]}, image
 // RC2-DAG: [[P16:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P13]]}, ir
 
-// NRD2-DAG: [[P17:[0-9]+]]: linker, {[[P9]], [[P16]]}, hip-fatbin, (device-[[T]])
+// RC2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]])
+// RC2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
+// RC2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
+// RC2-DAG: [[P19:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// RC2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-[[T]])
 
-// NRD2-DAG: [[P18:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P17]]}, ir
-// NRD2-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-[[T]])
-// RDC2-DAG: [[P19:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
-// BIN2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-[[T]])
-// CL2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-[[T]])
-// RCL2-DAG: [[P22:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P21]]}, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH1]])" {[[P8]]}, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P15]]}, image
-// RC2-DAG: [[P22:[0-9]+]]: clang-offload-bundler, {[[P9]], [[P16]], [[P20]]}, object, (host-[[T]])
+// RCL2-DAG: [[P17:[0-9]+]]: linker, {[[P9]], [[P16]]}, object, (device-[[T]])
+// RCL2-DAG: [[P22:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P17]]}, object
+// RCL2-DAG: [[P23:[0-9]+]]: linker, {[[P20]], [[P22]]}, image, (host-[[T]])
+// RC2-DAG: [[P23:[0-9]+]]: clang-offload-bundler, {[[P9]], [[P16]], [[P20]]}, object, (host-[[T]])
 
 //
 // Test two gpu architecturess up to the assemble phase.
@@ -253,8 +280,13 @@
 // RL2-DAG: [[P1:[0-9]+]]: clang-offload-unbundler, {[[P0]]}, object, (host-[[T]])
 // L2-DAG: [[P2:[0-9]+]]: input, "{{.*}}obj2.o", object, (host-[[T]])
 // RL2-DAG: [[P3:[0-9]+]]: clang-offload-unbundler, {[[P2]]}, object, (host-[[T]])
-// NL2-DAG: [[P4:[0-9]+]]: linker, {[[P0]], [[P2]]}, image, (host-[[T]])
-// RL2-DAG: [[P4:[0-9]+]]: linker, {[[P1]], [[P3]]}, image, (host-[[T]])
-// RL2-DAG: [[P5:[0-9]+]]: linker, {[[P1]], [[P3]]}, image, (device-[[T]], [[ARCH1:gfx803]])
+
+// RL2-DAG: [[P4:[0-9]+]]: linker, {[[P1]], [[P3]]}, image, (device-[[T]], [[ARCH1:gfx803]])
+// RL2-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH1]])" {[[P4]]}, image
 // RL2-DAG: [[P6:[0-9]+]]: linker, {[[P1]], [[P3]]}, image, (device-[[T]], [[ARCH2:gfx900]])
-// RL2-DAG: [[P7:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P4]]}, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH1]])" {[[P5]]}, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P6]]}, image
+// RL2-DAG: [[P7:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P6]]}, image
+// RL2-DAG: [[P8:[0-9]+]]: linker, {[[P5]], [[P7]]}, object, (device-[[T]])
+// RL2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P8]]}, object
+
+// NL2-DAG: [[P4:[0-9]+]]: linker, {[[P0]], [[P2]]}, image, (host-[[T]])
+// RL2-DAG: [[P4:[0-9]+]]: linker, {[[P1]], [[P3]], [[P9]]}, image, (host-[[T]])

diff  --git a/clang/test/Driver/hip-save-temps.hip b/clang/test/Driver/hip-save-temps.hip
index 1ac506aa6fba..f326f4155a15 100644
--- a/clang/test/Driver/hip-save-temps.hip
+++ b/clang/test/Driver/hip-save-temps.hip
@@ -25,13 +25,20 @@
 // -fgpu-rdc without -o
 // RUN: %clang -### -target x86_64-linux-gnu -nogpulib -save-temps \
 // RUN:   -fgpu-rdc --cuda-gpu-arch=gfx900 %s 2>&1 | \
-// RUN:   FileCheck -check-prefixes=CHECK,RDC,RDCL,RDC-NOUT,NOUT %s
+// RUN:   FileCheck -check-prefixes=CHECK,RDC,RDCL,NOUT %s
 
 // -fgpu-rdc with -o
-// RUN: %clang -### -target x86_64-linux-gnu -nogpulib -save-temps \
-// RUN:   -o executable -fgpu-rdc --cuda-gpu-arch=gfx900 %s 2>&1 | \
-// RUN:   FileCheck -check-prefixes=CHECK,RDC,RDCL,RDC-WOUT,WOUT %s
+// UN: %clang -### -target x86_64-linux-gnu -nogpulib -save-temps \
+// UN:   -o executable -fgpu-rdc --cuda-gpu-arch=gfx900 %s 2>&1 | \
+// UN:   FileCheck -check-prefixes=CHECK,RDC,RDCL,WOUT %s
+
+// -fgpu-rdc host object path
+// RDCL: "{{.*clang.*}}" "-cc1" {{.*}} "-E" {{.*}} "-o" "hip-save-temps-host-x86_64-unknown-linux-gnu.cui"
+// RDCL: "{{.*clang.*}}" "-cc1" {{.*}} "-emit-llvm-bc" {{.*}} "-o" "hip-save-temps-host-x86_64-unknown-linux-gnu.bc"
+// RDCL: "{{.*clang.*}}" "-cc1" {{.*}} "-S" {{.*}} "-o" "hip-save-temps-host-x86_64-unknown-linux-gnu.s"
+// RDCL: "{{.*clang.*}}" "-cc1as" {{.*}} "-o" "hip-save-temps-host-x86_64-unknown-linux-gnu.o"
 
+// device object paths
 // CHECK: {{".*clang.*"}} "-cc1" {{.*}} "-E" {{.*}} [[CPU:"-target-cpu" "gfx900"]] {{.*}}  "-o" "hip-save-temps-hip-amdgcn-amd-amdhsa-gfx900.cui"
 // NORDC: {{".*clang.*"}} "-cc1" {{.*}} "-emit-llvm-bc" {{.*}} [[CPU]] {{.*}} "-disable-llvm-passes" {{.*}} "-o" "hip-save-temps-hip-amdgcn-amd-amdhsa-gfx900.bc"
 // RDC: {{".*clang.*"}} "-cc1" {{.*}} "-emit-llvm-bc" {{.*}} [[CPU]] {{.*}} "-disable-llvm-passes" {{.*}} "-o" "hip-save-temps-hip-amdgcn-amd-amdhsa-gfx900.tmp.bc"
@@ -43,22 +50,26 @@
 
 // RDC: {{".*clang.*"}} "-cc1" {{.*}} "-emit-llvm-bc" {{.*}} [[CPU]] {{.*}} "-o" "hip-save-temps-hip-amdgcn-amd-amdhsa-gfx900.bc"
 // NORDC: {{".*clang.*"}} "-cc1as" {{.*}} "-filetype" "obj" {{.*}} [[CPU]] {{.*}} "-o" "hip-save-temps-hip-amdgcn-amd-amdhsa-gfx900.o"
-// CHECK-NOT: llvm-link
-// CHECK-NOT: opt
-// CHECK-NOT: llc
+// CHECK-NOT: "{{.*}}llvm-link"
+// CHECK-NOT: "{{.*}}opt"
+// CHECK-NOT: "{{.*}}llc"
 // NORDC: {{.*lld.*}}"-o" "hip-save-temps-hip-amdgcn-amd-amdhsa-gfx900.out"
-
 // RDCL: "{{.*lld.*}}" {{.*}} "-mllvm" "-amdgpu-internalize-symbols"
 // RDCL-SAME: "-o" "a.out-hip-amdgcn-amd-amdhsa-gfx900"
+// RDCC: "{{.*clang.*}}" "-cc1" {{.*}} "-E" {{.*}} "-o" "hip-save-temps-host-x86_64-unknown-linux-gnu.cui"
+// RDCC: "{{.*clang.*}}" "-cc1" {{.*}} "-emit-llvm-bc" {{.*}} "-o" "hip-save-temps-host-x86_64-unknown-linux-gnu.bc"
+// RDCC: "{{.*clang.*}}" "-cc1" {{.*}} "-S" {{.*}} "-o" "hip-save-temps-host-x86_64-unknown-linux-gnu.s"
+// RDCC: "{{.*clang.*}}" "-cc1as" {{.*}} "-o" "hip-save-temps-host-x86_64-unknown-linux-gnu.o"
+// RDCC: "{{.*clang-offload-bundler.*}}" {{.*}} "-outputs=hip-save-temps.o"
+// RDCL: "{{.*clang-offload-bundler.*}}" {{.*}} "-outputs=hip-save-temps-hip-amdgcn-amd-amdhsa.hipfb"
+// RDCL: {{.*}}llvm-mc{{.*}}"-o" "hip-save-temps-hip-amdgcn-amd-amdhsa.o" "hip-save-temps-hip-amdgcn-amd-amdhsa.mcin" "--filetype=obj"
 
-// NORDC: "{{.*clang-offload-bundler.*}}" {{.*}} "-outputs=hip-save-temps.hip-hip-amdgcn-amd-amdhsa.hipfb"
-// CHECK: "{{.*clang.*}}" "-cc1" {{.*}} "-E" {{.*}} "-o" "hip-save-temps-host-x86_64-unknown-linux-gnu.cui"
+// -fno-gpu-rdc host object path
+// NORDC: "{{.*clang.*}}" "-cc1" {{.*}} "-E" {{.*}} "-o" "hip-save-temps-host-x86_64-unknown-linux-gnu.cui"
 // NORDC: "{{.*clang.*}}" "-cc1" {{.*}} "-emit-llvm-bc" {{.*}}  "-fcuda-include-gpubinary" {{.*}} "-o" "hip-save-temps-host-x86_64-unknown-linux-gnu.bc"
-// RDC: "{{.*clang.*}}" "-cc1" {{.*}} "-emit-llvm-bc" {{.*}} "-o" "hip-save-temps-host-x86_64-unknown-linux-gnu.bc"
-// CHECK: "{{.*clang.*}}" "-cc1" {{.*}} "-S" {{.*}} "-o" "hip-save-temps-host-x86_64-unknown-linux-gnu.s"
-// CHECK: "{{.*clang.*}}" "-cc1as" {{.*}} "-o" "hip-save-temps{{.*}}.o"
-// RDCC: "{{.*clang-offload-bundler.*}}" {{.*}} "-outputs=hip-save-temps.o"
-// RDC-NOUT: "{{.*clang-offload-bundler.*}}" {{.*}} "-outputs=a.out.hipfb"
-// RDC-WOUT: "{{.*clang-offload-bundler.*}}" {{.*}} "-outputs=executable.hipfb"
-// NOUT: "{{.*ld.*}}" {{.*}} "-o" "a.out"
-// WOUT: "{{.*ld.*}}" {{.*}} "-o" "executable"
\ No newline at end of file
+// NORDC: "{{.*clang.*}}" "-cc1" {{.*}} "-S" {{.*}} "-o" "hip-save-temps-host-x86_64-unknown-linux-gnu.s"
+// NORDC: "{{.*clang.*}}" "-cc1as" {{.*}} "-o" "hip-save-temps{{.*}}.o"
+
+// output to default a.out or -o specified file name
+// NOUT: {{.*}}ld{{.*}}"-o" "a.out"
+// WOUT: {{.*}}ld{{.*}}"-o" "executable"

diff  --git a/clang/test/Driver/hip-toolchain-rdc-separate.hip b/clang/test/Driver/hip-toolchain-rdc-separate.hip
index 25661f399a82..25fd44f2529b 100644
--- a/clang/test/Driver/hip-toolchain-rdc-separate.hip
+++ b/clang/test/Driver/hip-toolchain-rdc-separate.hip
@@ -86,12 +86,22 @@
 
 // LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
 // LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
-// LINK-SAME: "-inputs=[[A_O:.*a.o]]" "-outputs=[[A_OBJ_HOST:.*o]],[[A_BC1:.*o]],[[A_BC2:.*o]]"
+// LINK-SAME: "-inputs=[[A_O:.*a.o]]" "-outputs=[[A_OBJ_HOST:.*o]],{{.*o}},{{.*o}}"
 // LINK: "-unbundle"
 
 // LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
 // LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
-// LINK-SAME: "-inputs=[[B_O:.*b.o]]" "-outputs=[[B_OBJ_HOST:.*o]],[[B_BC1:.*o]],[[B_BC2:.*o]]"
+// LINK-SAME: "-inputs=[[B_O:.*b.o]]" "-outputs=[[B_OBJ_HOST:.*o]],{{.*o}},{{.*o}}"
+// LINK: "-unbundle"
+
+// LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// LINK-SAME: "-inputs=[[A_O]]" "-outputs={{.*o}},[[A_BC1:.*o]],[[A_BC2:.*o]]"
+// LINK: "-unbundle"
+
+// LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// LINK-SAME: "-inputs=[[B_O]]" "-outputs={{.*o}},[[B_BC1:.*o]],[[B_BC2:.*o]]"
 // LINK: "-unbundle"
 
 // LINK-NOT: "*.llvm-link"
@@ -110,5 +120,8 @@
 // LINK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // LINK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*hipfb]]"
 
-// LINK: [[LD:".*ld.*"]] {{.*}} "[[A_OBJ_HOST]]" "[[B_OBJ_HOST]]"
-// LINK-SAME: {{.*}} "-T" "{{.*}}.lk"
+// LINK: {{".*llvm-mc.*"}} "-triple" "amdgcn-amd-amdhsa" "-o"
+// LINK-SAME: "[[OBJBUNDLE:.*o]]" "{{.*}}.mcin" "--filetype=obj"
+
+// LINK: [[LD:".*ld.*"]] {{.*}} "-o" "a.out" {{.*}} "[[A_OBJ_HOST]]"
+// LINK-SAME: "[[B_OBJ_HOST]]" "[[OBJBUNDLE]]"

diff  --git a/clang/test/Driver/hip-toolchain-rdc.hip b/clang/test/Driver/hip-toolchain-rdc.hip
index f520236abcb9..7148478b4285 100644
--- a/clang/test/Driver/hip-toolchain-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-rdc.hip
@@ -12,7 +12,23 @@
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s
 
-// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// emit objects for host side path
+// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
+// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
+// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
+
+// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
+// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
+// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
+
+// generate image for device side path on gfx803
+// CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-emit-llvm-bc"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
@@ -21,7 +37,7 @@
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx803"
 // CHECK-SAME: {{.*}} "-o" [[A_BC1:".*bc"]] "-x" "hip"
-// CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
+// CHECK-SAME: {{.*}} [[A_SRC]]
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
@@ -32,7 +48,7 @@
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx803"
 // CHECK-SAME: {{.*}} "-o" [[B_BC1:".*bc"]] "-x" "hip"
-// CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
+// CHECK-SAME: {{.*}} [[B_SRC]]
 
 // CHECK-NOT: "*.llvm-link"
 // CHECK-NOT: ".*opt"
@@ -40,6 +56,7 @@
 // CHECK: {{".*lld.*"}} {{.*}} "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-o" "[[IMG_DEV1:.*.out]]" [[A_BC1]] [[B_BC1]]
 
+// generate image for device side path on gfx900
 // CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-emit-llvm-bc"
@@ -66,23 +83,13 @@
 // CHECK: {{".*lld.*"}} {{.*}} "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-o" "[[IMG_DEV2:.*.out]]" [[A_BC2]] [[B_BC2]]
 
-// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
-// CHECK-SAME: "-emit-obj"
-// CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
-// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
-// CHECK-SAME: {{.*}} [[A_SRC]]
-
-// CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
-// CHECK-SAME: "-emit-obj"
-// CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
-// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
-// CHECK-SAME: {{.*}} [[B_SRC]]
-
+// combine images generated into hip fat binary object
 // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
 // CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
 // CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*hipfb]]"
 
-// CHECK: [[LD:".*ld.*"]] {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]]
-// CHECK-SAME: {{.*}} "-T" "{{.*}}.lk"
+// CHECK: [[MC:".*llvm-mc"]] "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-o" [[OBJBUNDLE:".*o"]] "{{.*}}.mcin" "--filetype=obj"
+
+// output the executable
+// CHECK: [[LD:".*ld.*"]] {{.*}}"-o" "a.out" {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]] [[OBJBUNDLE]]


        


More information about the cfe-commits mailing list