[clang] 5fc2673 - [HIP] Add --gpu-bundle-output

Wed Jun 9 20:54:47 PDT 2021

Author: Yaxun (Sam) Liu
Date: 2021-06-09T23:31:43-04:00
New Revision: 5fc2673fbce247e107094b28c22cbb2d5f1691a8

URL: https://github.com/llvm/llvm-project/commit/5fc2673fbce247e107094b28c22cbb2d5f1691a8
DIFF: https://github.com/llvm/llvm-project/commit/5fc2673fbce247e107094b28c22cbb2d5f1691a8.diff

LOG: [HIP] Add --gpu-bundle-output

Added --gpu-bundle-output to control bundling/unbundling output of HIP device compilation.

By default preprocessor expansion, llvm bitcode and assembly are unbundled, code objects are
bundled.

Reviewed by: Artem Belevich, Jan Svoboda

Differential Revision: https://reviews.llvm.org/D101630

Added: 
    

Modified: 
    clang/include/clang/Driver/Options.td
    clang/lib/Driver/Driver.cpp
    clang/test/Driver/clang-offload-bundler.c
    clang/test/Driver/hip-device-compile.hip
    clang/test/Driver/hip-output-file-name.hip
    clang/test/Driver/hip-phases.hip
    clang/test/Driver/hip-rdc-device-only.hip
    clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 55391cf2dac1d..7dcee76b4ed80 100644

--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -988,6 +988,10 @@ def gpu_instrument_lib_EQ : Joined<["--"], "gpu-instrument-lib=">,
 def fgpu_sanitize : Flag<["-"], "fgpu-sanitize">, Group<f_Group>,
   HelpText<"Enable sanitizer for AMDGPU target">;
 def fno_gpu_sanitize : Flag<["-"], "fno-gpu-sanitize">, Group<f_Group>;
+def gpu_bundle_output : Flag<["--"], "gpu-bundle-output">,
+  Group<f_Group>, HelpText<"Bundle output files of HIP device compilation">;
+def no_gpu_bundle_output : Flag<["--"], "no-gpu-bundle-output">,
+  Group<f_Group>, HelpText<"Do not bundle output files of HIP device compilation">;
 def cuid_EQ : Joined<["-"], "cuid=">, Flags<[CC1Option]>,
   HelpText<"An ID for compilation unit, which should be the same for the same "
            "compilation unit but different for different compilation units. "

diff  --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index cd2c8c9b19165..930941fe8558c 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -2907,6 +2907,12 @@ class OffloadingActionBuilder final {
     /// The linker inputs obtained for each device arch.
     SmallVector<ActionList, 8> DeviceLinkerInputs;
     bool GPUSanitize;
+    // The default bundling behavior depends on the type of output, therefore
+    // BundleOutput needs to be tri-value: None, true, or false.
+    // Bundle code objects unless --no-gpu-bundle-output is specified for
+    // device only compilation. Bundle other types of output files only if
+    // --gpu-bundle-output is specified for device only compilation.
+    Optional<bool> BundleOutput;
 
   public:
     HIPActionBuilder(Compilation &C, DerivedArgList &Args,
@@ -2915,6 +2921,10 @@ class OffloadingActionBuilder final {
       DefaultCudaArch = CudaArch::GFX803;
       GPUSanitize = Args.hasFlag(options::OPT_fgpu_sanitize,
                                  options::OPT_fno_gpu_sanitize, false);
+      if (Args.hasArg(options::OPT_gpu_bundle_output,
+                      options::OPT_no_gpu_bundle_output))
+        BundleOutput = Args.hasFlag(options::OPT_gpu_bundle_output,
+                                    options::OPT_no_gpu_bundle_output);
     }
 
     bool canUseBundlerUnbundler() const override { return true; }
@@ -3004,22 +3014,25 @@ class OffloadingActionBuilder final {
           CudaDeviceActions[I] = C.MakeAction<OffloadAction>(
               DDep, CudaDeviceActions[I]->getType());
         }
-        // Create HIP fat binary with a special "link" action.
-        CudaFatBinary =
-            C.MakeAction<LinkJobAction>(CudaDeviceActions,
-                types::TY_HIP_FATBIN);
 
-        if (!CompileDeviceOnly) {
-          DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
-                 AssociatedOffloadKind);
-          // Clear the fat binary, it is already a dependence to an host
-          // action.
-          CudaFatBinary = nullptr;
-        }
+        if (!CompileDeviceOnly || !BundleOutput.hasValue() ||
+            BundleOutput.getValue()) {
+          // Create HIP fat binary with a special "link" action.
+          CudaFatBinary = C.MakeAction<LinkJobAction>(CudaDeviceActions,
+                                                      types::TY_HIP_FATBIN);
 
-        // Remove the CUDA actions as they are already connected to an host
-        // action or fat binary.
-        CudaDeviceActions.clear();
+          if (!CompileDeviceOnly) {
+            DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
+                   AssociatedOffloadKind);
+            // Clear the fat binary, it is already a dependence to an host
+            // action.
+            CudaFatBinary = nullptr;
+          }
+
+          // Remove the CUDA actions as they are already connected to an host
+          // action or fat binary.
+          CudaDeviceActions.clear();
+        }
 
         return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success;
       } else if (CurPhase == phases::Link) {
@@ -3046,6 +3059,20 @@ class OffloadingActionBuilder final {
         A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A,
                                                AssociatedOffloadKind);
 
+      if (CompileDeviceOnly && CurPhase == FinalPhase &&
+          BundleOutput.hasValue() && BundleOutput.getValue()) {
+        for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+          OffloadAction::DeviceDependences DDep;
+          DDep.add(*CudaDeviceActions[I], *ToolChains.front(), GpuArchList[I],
+                   AssociatedOffloadKind);
+          CudaDeviceActions[I] = C.MakeAction<OffloadAction>(
+              DDep, CudaDeviceActions[I]->getType());
+        }
+        CudaFatBinary =
+            C.MakeAction<OffloadBundlingJobAction>(CudaDeviceActions);
+        CudaDeviceActions.clear();
+      }
+
       return (CompileDeviceOnly && CurPhase == FinalPhase) ? ABRT_Ignore_Host
                                                            : ABRT_Success;
     }

diff  --git a/clang/test/Driver/clang-offload-bundler.c b/clang/test/Driver/clang-offload-bundler.c
index 221aaeb6316d8..faa6c5161a8f9 100644
--- a/clang/test/Driver/clang-offload-bundler.c
+++ b/clang/test/Driver/clang-offload-bundler.c
@@ -361,6 +361,21 @@
 // CKLST2-NOT: openmp-powerpc64le-ibm-linux-gnu
 // CKLST2-NOT: openmp-x86_64-pc-linux-gnu
 
+//
+// Check bundling without host target is allowed for HIP.
+//
+// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa-gfx900,hip-amdgcn-amd-amdhsa-gfx906 \
+// RUN:   -inputs=%t.tgt1,%t.tgt2 -outputs=%t.hip.bundle.bc
+// RUN: clang-offload-bundler -type=bc -list -inputs=%t.hip.bundle.bc | FileCheck -check-prefix=NOHOST %s
+// RUN: clang-offload-bundler -type=bc -targets=hip-amdgcn-amd-amdhsa-gfx900,hip-amdgcn-amd-amdhsa-gfx906 \
+// RUN:   -outputs=%t.res.tgt1,%t.res.tgt2 -inputs=%t.hip.bundle.bc -unbundle
+// RUN: diff %t.tgt1 %t.res.tgt1
+// RUN: diff %t.tgt2 %t.res.tgt2
+//
+// NOHOST-NOT: host-
+// NOHOST-DAG: hip-amdgcn-amd-amdhsa-gfx900
+// NOHOST-DAG: hip-amdgcn-amd-amdhsa-gfx906
+
 // Some code so that we can create a binary out of this file.
 int A = 0;
 void test_func(void) {

diff  --git a/clang/test/Driver/hip-device-compile.hip b/clang/test/Driver/hip-device-compile.hip
index d7f63d2d68355..8711a1fadfc18 100644
--- a/clang/test/Driver/hip-device-compile.hip
+++ b/clang/test/Driver/hip-device-compile.hip
@@ -3,28 +3,56 @@
 // REQUIRES: amdgpu-registered-target
 
 // If -emit-llvm and/or -S is used in device only compilation,
-// the output should not be bundled.
+// the output should not be bundled, unless --gpu-bundle-output
+// is specified.
 
+// Output unbundled bitcode.
 // RUN: %clang -c -emit-llvm --cuda-device-only -### -target x86_64-linux-gnu \
-// RUN:   -o a.bc -x hip --cuda-gpu-arch=gfx900 \
+// RUN:   -o a.bc -x hip --cuda-gpu-arch=gfx900 --no-gpu-bundle-output \
 // RUN:   --hip-device-lib=lib1.bc \
 // RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,BC %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,BC,NBUN %s
 
+// Output bundled bitcode.
+// RUN: %clang -c -emit-llvm --cuda-device-only -### -target x86_64-linux-gnu \
+// RUN:   -o a.bc -x hip --cuda-gpu-arch=gfx900 --no-gpu-bundle-output \
+// RUN:   --hip-device-lib=lib1.bc \
+// RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu --gpu-bundle-output \
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,BCBUN %s
+
+// Output unbundled LLVM IR.
 // RUN: %clang -c -S -emit-llvm --cuda-device-only -### -target x86_64-linux-gnu \
-// RUN:   -o a.ll -x hip --cuda-gpu-arch=gfx900 \
+// RUN:   -o a.ll -x hip --cuda-gpu-arch=gfx900 --no-gpu-bundle-output \
 // RUN:   --hip-device-lib=lib1.bc \
 // RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LL %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LL,NBUN %s
+
+// Output bundled LLVM IR.
+// RUN: %clang -c -S -emit-llvm --cuda-device-only -### -target x86_64-linux-gnu \
+// RUN:   -o a.ll -x hip --cuda-gpu-arch=gfx900 --no-gpu-bundle-output \
+// RUN:   --hip-device-lib=lib1.bc \
+// RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu --gpu-bundle-output \
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LLBUN %s
 
+// Output unbundled assembly.
 // RUN: %clang -c -S --cuda-device-only -### -target x86_64-linux-gnu \
-// RUN:   -o a.s -x hip --cuda-gpu-arch=gfx900 \
+// RUN:   -o a.s -x hip --cuda-gpu-arch=gfx900 --no-gpu-bundle-output \
 // RUN:   --hip-device-lib=lib1.bc \
 // RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,ASM %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,ASM,NBUN %s
+
+// Output bundled assembly.
+// RUN: %clang -c -S --cuda-device-only -### -target x86_64-linux-gnu \
+// RUN:   -o a.s -x hip --cuda-gpu-arch=gfx900 --no-gpu-bundle-output \
+// RUN:   --hip-device-lib=lib1.bc \
+// RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu --gpu-bundle-output \
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,ASMBUN %s
 
 // CHECK: {{".*clang.*"}} "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
@@ -36,38 +64,64 @@
 // CHECK-SAME: {{".*lib1.bc"}}
 // CHECK-SAME: "-target-cpu" "gfx900"
 // BC-SAME: "-o" "a.bc"
+// BCBUN-SAME: "-o" "{{.*}}.bc"
 // LL-SAME: "-o" "a.ll"
+// LLBUN-SAME: "-o" "{{.*}}.ll"
 // ASM-SAME: "-o" "a.s"
+// ASMBUN-SAME: "-o" "{{.*}}.s"
 // CHECK-SAME: {{".*a.cu"}}
 
 // CHECK-NOT: {{"*.llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 // CHECK-NOT: {{".*lld.*"}}
-// CHECK-NOT: {{".*clang-offload-bundler"}}
+// NBUN-NOT: {{".*clang-offload-bundler"}}
+// BCBUN: {{".*clang-offload-bundler"}}{{.*}}"-outputs=a.bc"
+// LLBUN: {{".*clang-offload-bundler"}}{{.*}}"-outputs=a.ll"
+// ASMBUN: {{".*clang-offload-bundler"}}{{.*}}"-outputs=a.s"
 // CHECK-NOT: {{".*ld.*"}}
 
 // If neither -emit-llvm nor -S is used in device only compilation,
-// the output should be bundled.
+// the output should be bundled unless --no-gpu-bundle-output is
+// specified.
 
+// Output bundled code objects.
 // RUN: %clang -c --cuda-device-only -### -target x86_64-linux-gnu \
-// RUN:   -o a.s -x hip --cuda-gpu-arch=gfx900 \
+// RUN:   -o a.o -x hip --cuda-gpu-arch=gfx900 \
 // RUN:   --hip-device-lib=lib1.bc \
 // RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
-// RUN: 2>&1 | FileCheck -check-prefixes=BUNDLE %s
+// RUN: 2>&1 | FileCheck -check-prefixes=OBJ,OBJ-BUN %s
 
+// Output unbundled code objects.
+// RUN: %clang -c --cuda-device-only -### -target x86_64-linux-gnu \
+// RUN:   -o a.o -x hip --cuda-gpu-arch=gfx900 \
+// RUN:   --hip-device-lib=lib1.bc \
+// RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu --no-gpu-bundle-output \
+// RUN: 2>&1 | FileCheck -check-prefixes=OBJ,OBJ-UBUN %s
+
+// Output bundled code objects.
 // RUN: %clang --cuda-device-only -### -target x86_64-linux-gnu \
-// RUN:   -o a.s -x hip --cuda-gpu-arch=gfx900 \
+// RUN:   -o a.o -x hip --cuda-gpu-arch=gfx900 \
 // RUN:   --hip-device-lib=lib1.bc \
 // RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
-// RUN: 2>&1 | FileCheck -check-prefixes=BUNDLE %s
+// RUN: 2>&1 | FileCheck -check-prefixes=OBJ,OBJ-BUN %s
 
-// BUNDLE: {{"*.clang.*"}} {{.*}} "-emit-obj"
-// BUNDLE-NOT: {{"*.llvm-link"}}
-// BUNDLE-NOT: {{".*opt"}}
-// BUNDLE-NOT: {{".*llc"}}
-// BUNDLE: {{".*lld.*"}}
-// BUNDLE: {{".*clang-offload-bundler"}}
+// Output unbundled code objects.
+// RUN: %clang --cuda-device-only -### -target x86_64-linux-gnu \
+// RUN:   -o a.o -x hip --cuda-gpu-arch=gfx900 \
+// RUN:   --hip-device-lib=lib1.bc \
+// RUN:   --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu --no-gpu-bundle-output \
+// RUN: 2>&1 | FileCheck -check-prefixes=OBJ,OBJ-UBUN %s
 
+// OBJ: {{"*.clang.*"}} {{.*}} "-emit-obj"
+// OBJ-NOT: {{"*.llvm-link"}}
+// OBJ-NOT: {{".*opt"}}
+// OBJ-NOT: {{".*llc"}}
+// OBJ-BUN: {{".*lld.*"}}{{.*}}"-o" "{{.*}}.o"
+// OBJ-UBUN: {{".*lld.*"}}{{.*}}"-o" "a.o"
+// OBJ-BUN: {{".*clang-offload-bundler"}}{{.*}}"-outputs=a.o"
+// OBJ-UBUN-NOT: {{".*clang-offload-bundler"}}

diff  --git a/clang/test/Driver/hip-output-file-name.hip b/clang/test/Driver/hip-output-file-name.hip
index b0b1a9d7ff3d2..2957b9f12c756 100644
--- a/clang/test/Driver/hip-output-file-name.hip
+++ b/clang/test/Driver/hip-output-file-name.hip
@@ -2,6 +2,7 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: amdgpu-registered-target
 
+// Output bundled code objects for combined compilation.
 // RUN: %clang -### -c -target x86_64-linux-gnu -fgpu-rdc \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
 // RUN: 2>&1 | FileCheck %s
@@ -9,43 +10,77 @@
 // CHECK: {{.*}}clang-offload-bundler{{.*}}"-outputs=hip-output-file-name.o"
 
 // Check -E default output is "-" (stdout).
+// If there are multiple preprocessor expansion outputs clang-offload-bundler
+// is used to bundle the final output.
 
+// Output bundled PPE for one GPU for mixed compilation.
+// RUN: %clang -### -E -target x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=gfx803 %s \
+// RUN: 2>&1 | FileCheck -check-prefixes=DASH %s
+
+// Output unbundled PPE for one GPU for device only compilation.
+// RUN: %clang -### -E --cuda-device-only -target x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=gfx803 %s \
+// RUN: 2>&1 | FileCheck -check-prefixes=CLANG-DASH %s
+
+// Output bundled PPE for two GPUs for mixed compilation.
 // RUN: %clang -### -E -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
 // RUN: 2>&1 | FileCheck -check-prefixes=DASH %s
 
+// Output bundled PPE for two GPUs for mixed compilation with -save-temps.
 // RUN: %clang -### -E -save-temps -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
 // RUN: 2>&1 | FileCheck -check-prefixes=DASH %s
 
+// Output unbundled PPE for two GPUs for device only compilation.
 // RUN: %clang -### -E --cuda-device-only -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
 // RUN: 2>&1 | FileCheck -check-prefixes=CLANG-DASH %s
 
+// Output bundled PPE for two GPUs for device only compilation with --gpu-bundle-output.
+// RUN: %clang -### -E --cuda-device-only -target x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --gpu-bundle-output \
+// RUN: 2>&1 | FileCheck -check-prefixes=DASH %s
+
+// Output unbundled PPE for two GPUs for device only compilation with --no-gpu-bundle-output.
+// RUN: %clang -### -E --cuda-device-only -target x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s --no-gpu-bundle-output \
+// RUN: 2>&1 | FileCheck -check-prefixes=CLANG-DASH %s
+
+// Output unbundled PPE for host only compilation.
 // RUN: %clang -### -E --cuda-host-only -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
 // RUN: 2>&1 | FileCheck -check-prefixes=CLANG-DASH %s
 
+// DASH-NOT: {{.*}}clang{{.*}}"-o" "-"
 // DASH: {{.*}}clang-offload-bundler{{.*}}"-outputs=-"
 // CLANG-DASH: {{.*}}clang{{.*}}"-o" "-"
+// CLANG-DASH-NOT: {{.*}}clang-offload-bundler{{.*}}"-outputs=-"
 
 // Check -E with -o.
 
+// Output bundled PPE for two GPUs for mixed compilation.
 // RUN: %clang -### -E -o test.cui -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
 // RUN: 2>&1 | FileCheck -check-prefixes=OUT %s
 
+// Output bundled PPE for two GPUs for mixed compilation.
 // RUN: %clang -### -E -o test.cui -save-temps -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
 // RUN: 2>&1 | FileCheck -check-prefixes=OUT %s
 
+// Output bundled PPE for two GPUs for device only compilation with --gpu-bundle-output.
 // RUN: %clang -### -E -o test.cui --cuda-device-only -target x86_64-linux-gnu \
-// RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
-// RUN: 2>&1 | FileCheck -check-prefixes=CLANG-OUT %s
+// RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 --gpu-bundle-output %s \
+// RUN: 2>&1 | FileCheck -check-prefixes=OUT %s
 
+// Output unbundled PPE for host only compilation.
 // RUN: %clang -### -E -o test.cui --cuda-host-only -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
 // RUN: 2>&1 | FileCheck -check-prefixes=CLANG-OUT %s
 
+// OUT-NOT: {{.*}}clang{{.*}}"-o" "test.cui"
 // OUT: {{.*}}clang-offload-bundler{{.*}}"-outputs=test.cui"
 // CLANG-OUT: {{.*}}clang{{.*}}"-o" "test.cui"
+// CLANG-OUT-NOT: {{.*}}clang-offload-bundler{{.*}}"-outputs=test.cui"

diff  --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip
index b4d1f3adebec0..623299b13dd0c 100644
--- a/clang/test/Driver/hip-phases.hip
+++ b/clang/test/Driver/hip-phases.hip
@@ -231,13 +231,14 @@
 // compilation mode.
 //
 // RUN: %clang -x hip -target x86_64-unknown-linux-gnu -ccc-print-phases \
-// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only -S 2>&1 \
+// RUN: --cuda-gpu-arch=gfx803 %s --cuda-device-only -S --no-gpu-bundle-output 2>&1 \
 // RUN: | FileCheck -check-prefixes=DASM %s
 // DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
 // DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
 // DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
 // DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
 // DASM-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P3]]}, assembler
+// DASM-NOT: clang-offload-bundler
 // DASM-NOT: host
 
 //
@@ -270,8 +271,20 @@
 //
 // RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
 // RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -S -o %t.s 2>&1 \
+// RUN: | FileCheck -check-prefixes=DASM2,DASM2-NOBUNDLE %s
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -S -o %t.s --no-gpu-bundle-output 2>&1 \
+// RUN: | FileCheck -check-prefixes=DASM2,DASM2-NOBUNDLE %s
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
 // RUN: --cuda-device-only -S 2>&1 \
-// RUN: | FileCheck -check-prefixes=DASM2 %s
+// RUN: | FileCheck -check-prefixes=DASM2,DASM2-NOBUNDLE %s
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -S --gpu-bundle-output 2>&1 \
+// RUN: | FileCheck -check-prefixes=DASM2,DASM2-BUNDLE %s
 // DASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
 // DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
 // DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
@@ -282,6 +295,8 @@
 // DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]])
 // DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]])
 // DASM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P8]]}, assembler
+// DASM2-BUNDLE: [[P10:[0-9]+]]: clang-offload-bundler, {[[P4]], [[P9]]}, assembler, (device-hip, )
+// DASM2-NOBUNDLE-NOT: clang-offload-bundler, {[[P4]], [[P9]]}, assembler, (device-hip, )
 // DASM2-NOT: host
 
 //
@@ -312,3 +327,117 @@
 
 // NL2-DAG: [[P4:[0-9]+]]: linker, {[[P0]], [[P2]]}, image, (host-[[T]])
 // RL2-DAG: [[P4:[0-9]+]]: linker, {[[P1]], [[P3]], [[P9]]}, image, (host-[[T]])
+
+// Test one gpu architecture up to the preprocessor expansion output phase in device-only
+// compilation mode. no bundle.
+//
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 %s \
+// RUN: --cuda-device-only -E 2>&1 \
+// RUN: | FileCheck -check-prefixes=PPE,PPEN %s
+
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 %s \
+// RUN: --cuda-device-only -E --no-gpu-bundle-output 2>&1 \
+// RUN: | FileCheck -check-prefixes=PPE,PPEN %s
+
+// Test one gpu architecture up to the preprocessor expansion output phase in device-only
+// compilation mode. bundle.
+
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 %s \
+// RUN: --cuda-device-only -E --gpu-bundle-output 2>&1 \
+// RUN: | FileCheck -check-prefixes=PPE,PPEB %s
+
+// Test two gpu architectures up to the preprocessor expansion output phase in device-only
+// compilation mode. no bundle.
+
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -E 2>&1 \
+// RUN: | FileCheck -check-prefixes=PPE2,PPE2N %s
+
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -E --no-gpu-bundle-output 2>&1 \
+// RUN: | FileCheck -check-prefixes=PPE2,PPE2N %s
+
+// Test two gpu architectures up to the preprocessor expansion output phase in device-only
+// compilation mode. bundle.
+
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -E --gpu-bundle-output 2>&1 \
+// RUN: | FileCheck -check-prefixes=PPE2,PPE2B %s
+
+// Test one gpu architecture up to the LLVM IR output phase in device-only
+// compilation mode. no bundle.
+//
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 %s \
+// RUN: --cuda-device-only -c -emit-llvm 2>&1 \
+// RUN: | FileCheck -check-prefixes=LLVM %s
+
+// Test two gpu architectures up to the LLVM IR output phase in device-only
+// compilation mode. bundle.
+//
+// RUN: %clang -x hip -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -c -emit-llvm -o %t.bc --gpu-bundle-output 2>&1 \
+// RUN: | FileCheck -check-prefixes=LLVM2 %s
+
+// Test two gpu architectures up to the LLVM IR output phase in device-only
+// compilation mode with bundled preprocessor expansion as input. bundle.
+//
+// RUN: %clang -x hip-cpp-output -target x86_64-unknown-linux-gnu \
+// RUN: -ccc-print-phases --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %s \
+// RUN: --cuda-device-only -c -emit-llvm -o %t.bc --gpu-bundle-output 2>&1 \
+// RUN: | FileCheck -check-prefixes=PPELLVM2 %s
+
+// PPE-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// PPE-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// PPE-DAG: [[P2:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P1]]}, [[T]]-cpp-output
+// PPEB-DAG: [[P3:[0-9]+]]: clang-offload-bundler, {[[P2]]}, [[T]]-cpp-output, (device-hip, )
+// PPEN-NOT: clang-offload-bundler, {{.*}}, [[T]]-cpp-output, (device-hip, )
+// PPE-NOT: host
+
+// PPE2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// PPE2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// PPE2-DAG: [[P2:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P1]]}, [[T]]-cpp-output
+// PPE2-DAG: [[P5:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH2:gfx900]])
+// PPE2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// PPE2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P6]]}, [[T]]-cpp-output
+// PPE2B-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P2]], [[P9]]}, [[T]]-cpp-output, (device-hip, )
+// PPE2N-NOT: clang-offload-bundler, {{.*}}, [[T]]-cpp-output, (device-hip, )
+// PPE2-NOT: host
+
+// LLVM-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// LLVM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// LLVM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// LLVM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, ir, (device-[[T]], [[ARCH]])
+// LLVM-NOT: clang-offload-bundler
+// LLVM-NOT: host
+
+// LLVM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (device-[[T]], [[ARCH:gfx803]])
+// LLVM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
+// LLVM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
+// LLVM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, ir, (device-[[T]], [[ARCH]])
+// LLVM2-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P3]]}, ir
+// LLVM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH2:gfx900]])
+// LLVM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH2]])
+// LLVM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]])
+// LLVM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, ir, (device-[[T]], [[ARCH2]])
+// LLVM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P8]]}, ir
+// LLVM2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P4]], [[P9]]}, ir, (device-hip, )
+// LLVM2-NOT: host
+
+// PPELLVM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]]-cpp-output
+// PPELLVM2-DAG: [[P1:[0-9]+]]: clang-offload-unbundler, {[[P0]]}, hip-cpp-output
+// PPELLVM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH:gfx803]])
+// PPELLVM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, ir, (device-[[T]], [[ARCH]])
+// PPELLVM2-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P3]]}, ir
+// PPELLVM2-DAG: [[P7:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH2:gfx900]])
+// PPELLVM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, ir, (device-[[T]], [[ARCH2]])
+// PPELLVM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P8]]}, ir
+// PPELLVM2-DAG: [[P10:[0-9]+]]: clang-offload-bundler, {[[P4]], [[P9]]}, ir, (device-hip, )
+// PPELLVM2-NOT: host

diff  --git a/clang/test/Driver/hip-rdc-device-only.hip b/clang/test/Driver/hip-rdc-device-only.hip
index a8c3a3fe9d934..ca8d54ea633e2 100644
--- a/clang/test/Driver/hip-rdc-device-only.hip
+++ b/clang/test/Driver/hip-rdc-device-only.hip
@@ -6,7 +6,7 @@
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
 // RUN:   -c -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
-// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip --gpu-bundle-output \
 // RUN: 2>&1 | FileCheck -check-prefixes=COMMON,EMITBC %s
 
 // With `-emit-llvm`, the output should be the same as the aforementioned line
@@ -16,14 +16,14 @@
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
 // RUN:   -c -emit-llvm -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
-// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip --gpu-bundle-output \
 // RUN: 2>&1 | FileCheck -check-prefixes=COMMON,EMITBC %s
 
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
 // RUN:   -S -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
-// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip --gpu-bundle-output \
 // RUN: 2>&1 | FileCheck -check-prefixes=COMMON,EMITLL %s
 
 // With `-emit-llvm`, the output should be the same as the aforementioned line
@@ -33,7 +33,7 @@
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
 // RUN:   -S -emit-llvm -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
-// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip --gpu-bundle-output \
 // RUN: 2>&1 | FileCheck -check-prefixes=COMMON,EMITLL %s
 
 // With `-save-temps`, commane lines for each steps are dumped. For assembly
@@ -44,9 +44,17 @@
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
 // RUN:   -S -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
-// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip --gpu-bundle-output \
 // RUN: 2>&1 | FileCheck -check-prefix=SAVETEMP %s
 
+// Check that outputting to one file without bundling causes an error.
+
+// RUN: %clang -### -target x86_64-linux-gnu \
+// RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
+// RUN:   -S -nogpuinc -nogpulib --cuda-device-only -fgpu-rdc \
+// RUN:   %S/Inputs/hip_multiple_inputs/a.cu -o %t.s --no-gpu-bundle-output \
+// RUN: 2>&1 | FileCheck -check-prefix=FAIL %s
+
 // COMMON: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // COMMON-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
 // EMITBC-SAME: "-emit-llvm-bc"
@@ -56,8 +64,8 @@
 // COMMON-SAME: "-fapply-global-visibility-to-externs"
 // COMMON-SAME: "-target-cpu" "gfx803"
 // COMMON-SAME: "-fgpu-rdc"
-// EMITBC-SAME: {{.*}} "-o" {{"a.*bc"}} "-x" "hip"
-// EMITLL-SAME: {{.*}} "-o" {{"a.*ll"}} "-x" "hip"
+// EMITBC-SAME: {{.*}} "-o" {{".*a.*bc"}} "-x" "hip"
+// EMITLL-SAME: {{.*}} "-o" {{".*a.*ll"}} "-x" "hip"
 // CHECK-SAME: {{.*}} {{".*a.cu"}}
 
 // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
@@ -69,10 +77,14 @@
 // COMMON-SAME: "-fapply-global-visibility-to-externs"
 // COMMON-SAME: "-target-cpu" "gfx900"
 // COMMON-SAME: "-fgpu-rdc"
-// EMITBC-SAME: {{.*}} "-o" {{"a.*bc"}} "-x" "hip"
-// EMITLL-SAME: {{.*}} "-o" {{"a.*ll"}} "-x" "hip"
+// EMITBC-SAME: {{.*}} "-o" {{".*a.*bc"}} "-x" "hip"
+// EMITLL-SAME: {{.*}} "-o" {{".*a.*ll"}} "-x" "hip"
 // COMMON-SAME: {{.*}} {{".*a.cu"}}
 
+// COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}"
+// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// COMMON-SAME: "-outputs=a-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}"
+
 // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // COMMON-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
 // EMITBC-SAME: "-emit-llvm-bc"
@@ -82,8 +94,8 @@
 // COMMON-SAME: "-fapply-global-visibility-to-externs"
 // COMMON-SAME: "-target-cpu" "gfx803"
 // COMMON-SAME: "-fgpu-rdc"
-// EMITBC-SAME: {{.*}} "-o" {{"b.*bc"}} "-x" "hip"
-// EMITLL-SAME: {{.*}} "-o" {{"b.*ll"}} "-x" "hip"
+// EMITBC-SAME: {{.*}} "-o" {{".*b.*bc"}} "-x" "hip"
+// EMITLL-SAME: {{.*}} "-o" {{".*b.*ll"}} "-x" "hip"
 // COMMON-SAME: {{.*}} {{".*b.hip"}}
 
 // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
@@ -95,10 +107,14 @@
 // COMMON-SAME: "-fapply-global-visibility-to-externs"
 // COMMON-SAME: "-target-cpu" "gfx900"
 // COMMON-SAME: "-fgpu-rdc"
-// EMITBC-SAME: {{.*}} "-o" {{"b.*bc"}} "-x" "hip"
-// EMITLL-SAME: {{.*}} "-o" {{"b.*ll"}} "-x" "hip"
+// EMITBC-SAME: {{.*}} "-o" {{".*b.*bc"}} "-x" "hip"
+// EMITLL-SAME: {{.*}} "-o" {{".*b.*ll"}} "-x" "hip"
 // COMMON-SAME: {{.*}} {{".*b.hip"}}
 
+// COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}"
+// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// COMMON-SAME: "-outputs=b-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}"
+
 // SAVETEMP: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu"
 // SAVETEMP-SAME: "-E"
 // SAVETEMP-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx803"
@@ -125,6 +141,10 @@
 // SAVETEMP-SAME: {{.*}} "-main-file-name" "a.cu" {{.*}} "-target-cpu" "gfx900"
 // SAVETEMP-SAME: {{.*}} "-o" {{"a.*.ll"}} "-x" "ir" [[A_GFX900_TMP_BC]]
 
+// SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll"
+// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// SAVETEMP-SAME: "-outputs=a-hip-amdgcn-amd-amdhsa.ll"
+
 // SAVETEMP: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu"
 // SAVETEMP-SAME: "-E"
 // SAVETEMP-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx803"
@@ -150,3 +170,9 @@
 // SAVETEMP-SAME: "-emit-llvm"
 // SAVETEMP-SAME: {{.*}} "-main-file-name" "b.hip" {{.*}} "-target-cpu" "gfx900"
 // SAVETEMP-SAME: {{.*}} "-o" {{"b.*.ll"}} "-x" "ir" [[B_GFX900_TMP_BC]]
+
+// SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll"
+// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// SAVETEMP-SAME: "-outputs=b-hip-amdgcn-amd-amdhsa.ll"
+
+// FAIL: error: cannot specify -o when generating multiple output files

diff  --git a/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp b/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
index afa7c292a53f1..a049ae725c89a 100644
--- a/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
+++ b/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
@@ -117,6 +117,9 @@ static cl::opt<unsigned>
 /// The index of the host input in the list of inputs.
 static unsigned HostInputIndex = ~0u;
 
+/// Whether bundling without a host target is allowed.
+static bool AllowNoHost = false;
+
 /// Path to the current binary.
 static std::string BundlerExecutable;
 
@@ -839,9 +842,10 @@ static Error BundleFiles() {
   }
 
   // Get the file handler. We use the host buffer as reference.
-  assert(HostInputIndex != ~0u && "Host input index undefined??");
+  assert((HostInputIndex != ~0u || AllowNoHost) &&
+         "Host input index undefined??");
   Expected<std::unique_ptr<FileHandler>> FileHandlerOrErr =
-      CreateFileHandler(*InputBuffers[HostInputIndex]);
+      CreateFileHandler(*InputBuffers[AllowNoHost ? 0 : HostInputIndex]);
   if (!FileHandlerOrErr)
     return FileHandlerOrErr.takeError();
 
@@ -1108,6 +1112,7 @@ int main(int argc, const char **argv) {
   // have exactly one host target.
   unsigned Index = 0u;
   unsigned HostTargetNum = 0u;
+  bool HIPOnly = true;
   llvm::DenseSet<StringRef> ParsedTargets;
   for (StringRef Target : TargetNames) {
     if (ParsedTargets.contains(Target)) {
@@ -1149,12 +1154,21 @@ int main(int argc, const char **argv) {
       HostInputIndex = Index;
     }
 
+    if (Kind != "hip" && Kind != "hipv4")
+      HIPOnly = false;
+
     ++Index;
   }
 
+  // HIP uses clang-offload-bundler to bundle device-only compilation results
+  // for multiple GPU archs, therefore allow no host target if all entries
+  // are for HIP.
+  AllowNoHost = HIPOnly;
+
   // Host triple is not really needed for unbundling operation, so do not
   // treat missing host triple as error if we do unbundling.
-  if ((Unbundle && HostTargetNum > 1) || (!Unbundle && HostTargetNum != 1)) {
+  if ((Unbundle && HostTargetNum > 1) ||
+      (!Unbundle && HostTargetNum != 1 && !AllowNoHost)) {
     reportError(createStringError(errc::invalid_argument,
                                   "expecting exactly one host target but got " +
                                       Twine(HostTargetNum)));