[clang] 98ab43a - [HIP] Fix device only linking for -fgpu-rdc

Mon Jan 10 14:38:27 PST 2022

Author: Yaxun (Sam) Liu
Date: 2022-01-10T17:38:02-05:00
New Revision: 98ab43a1d209875ce9cc39420d3e6af57ed0f5b2

URL: https://github.com/llvm/llvm-project/commit/98ab43a1d209875ce9cc39420d3e6af57ed0f5b2
DIFF: https://github.com/llvm/llvm-project/commit/98ab43a1d209875ce9cc39420d3e6af57ed0f5b2.diff

LOG: [HIP] Fix device only linking for -fgpu-rdc

Currently when -fgpu-rdc is specified, HIP toolchain always does host linking even
if --cuda-device-only is specified.

This patch fixes that. Only device linking is performed when --cuda-device-only
is specified.

Reviewed by: Artem Belevich

Differential Revision: https://reviews.llvm.org/D116840

Added: 
    

Modified: 
    clang/lib/Driver/Driver.cpp
    clang/test/Driver/hip-phases.hip
    clang/test/Driver/hip-toolchain-rdc-separate.hip

Removed: 
    


################################################################################
diff  --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index bb7ccf7dd97eb..ac8438bb45a6a 100644

--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -3110,7 +3110,7 @@ class OffloadingActionBuilder final {
         // We will pass the device action as a host dependence, so we don't
         // need to do anything else with them.
         CudaDeviceActions.clear();
-        return ABRT_Success;
+        return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success;
       }
 
       // By default, we produce an action for each device arch.
@@ -3143,6 +3143,7 @@ class OffloadingActionBuilder final {
       assert(DeviceLinkerInputs.size() == GpuArchList.size() &&
              "Linker inputs and GPU arch list sizes do not match.");
 
+      ActionList Actions;
       // Append a new link action for each device.
       unsigned I = 0;
       for (auto &LI : DeviceLinkerInputs) {
@@ -3154,22 +3155,29 @@ class OffloadingActionBuilder final {
         OffloadAction::DeviceDependences DeviceLinkDeps;
         DeviceLinkDeps.add(*DeviceLinkAction, *ToolChains[0],
             GpuArchList[I], AssociatedOffloadKind);
-        AL.push_back(C.MakeAction<OffloadAction>(DeviceLinkDeps,
-            DeviceLinkAction->getType()));
+        Actions.push_back(C.MakeAction<OffloadAction>(
+            DeviceLinkDeps, DeviceLinkAction->getType()));
         ++I;
       }
       DeviceLinkerInputs.clear();
 
       // Create a host object from all the device images by embedding them
-      // in a fat binary.
+      // in a fat binary for mixed host-device compilation. For device-only
+      // compilation, creates a fat binary.
       OffloadAction::DeviceDependences DDeps;
-      auto *TopDeviceLinkAction =
-          C.MakeAction<LinkJobAction>(AL, types::TY_Object);
-      DDeps.add(*TopDeviceLinkAction, *ToolChains[0],
-          nullptr, AssociatedOffloadKind);
-
-      // Offload the host object to the host linker.
-      AL.push_back(C.MakeAction<OffloadAction>(DDeps, TopDeviceLinkAction->getType()));
+      if (!CompileDeviceOnly || !BundleOutput.hasValue() ||
+          BundleOutput.getValue()) {
+        auto *TopDeviceLinkAction = C.MakeAction<LinkJobAction>(
+            Actions,
+            CompileDeviceOnly ? types::TY_HIP_FATBIN : types::TY_Object);
+        DDeps.add(*TopDeviceLinkAction, *ToolChains[0], nullptr,
+                  AssociatedOffloadKind);
+        // Offload the host object to the host linker.
+        AL.push_back(
+            C.MakeAction<OffloadAction>(DDeps, TopDeviceLinkAction->getType()));
+      } else {
+        AL.append(Actions);
+      }
     }
 
     Action* appendLinkHostActions(ActionList &AL) override { return AL.back(); }
@@ -3556,15 +3564,18 @@ class OffloadingActionBuilder final {
     return false;
   }
 
-  Action* makeHostLinkAction() {
-    // Build a list of device linking actions.
-    ActionList DeviceAL;
+  void appendDeviceLinkActions(ActionList &AL) {
     for (DeviceActionBuilder *SB : SpecializedBuilders) {
       if (!SB->isValid())
         continue;
-      SB->appendLinkDeviceActions(DeviceAL);
+      SB->appendLinkDeviceActions(AL);
     }
+  }
 
+  Action *makeHostLinkAction() {
+    // Build a list of device linking actions.
+    ActionList DeviceAL;
+    appendDeviceLinkActions(DeviceAL);
     if (DeviceAL.empty())
       return nullptr;
 
@@ -3893,6 +3904,13 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
   }
 
   // Add a link action if necessary.
+
+  if (LinkerInputs.empty()) {
+    Arg *FinalPhaseArg;
+    if (getFinalPhase(Args, &FinalPhaseArg) == phases::Link)
+      OffloadBuilder.appendDeviceLinkActions(Actions);
+  }
+
   if (!LinkerInputs.empty()) {
     if (Action *Wrapper = OffloadBuilder.makeHostLinkAction())
       LinkerInputs.push_back(Wrapper);

diff  --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip
index 623299b13dd0c..93bf635dab63f 100644
--- a/clang/test/Driver/hip-phases.hip
+++ b/clang/test/Driver/hip-phases.hip
@@ -311,22 +311,36 @@
 //
 // RUN: %clang -target x86_64-unknown-linux-gnu -ccc-print-phases --hip-link \
 // RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %T/obj1.o %T/obj2.o \
-// RUN: -fgpu-rdc 2>&1 | FileCheck -check-prefixes=L2,RL2 %s
+// RUN: -fgpu-rdc 2>&1 | FileCheck -check-prefixes=L2,RL2,RL2-EM %s
 //
-// L2-DAG: [[P0:[0-9]+]]: input, "{{.*}}obj1.o", object, (host-[[T:hip]])
-// RL2-DAG: [[P1:[0-9]+]]: clang-offload-unbundler, {[[P0]]}, object, (host-[[T]])
-// L2-DAG: [[P2:[0-9]+]]: input, "{{.*}}obj2.o", object, (host-[[T]])
-// RL2-DAG: [[P3:[0-9]+]]: clang-offload-unbundler, {[[P2]]}, object, (host-[[T]])
+// RUN: %clang -target x86_64-unknown-linux-gnu -ccc-print-phases --hip-link \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %T/obj1.o %T/obj2.o \
+// RUN: -fgpu-rdc --cuda-device-only 2>&1 | FileCheck -check-prefixes=L2,RL2,RL2-DEV %s
 
-// RL2-DAG: [[P4:[0-9]+]]: linker, {[[P1]], [[P3]]}, image, (device-[[T]], [[ARCH1:gfx803]])
+// RUN: %clang -target x86_64-unknown-linux-gnu -ccc-print-phases --hip-link \
+// RUN: --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 %T/obj1.o %T/obj2.o \
+// RUN: -fgpu-rdc --cuda-device-only --no-gpu-bundle-output 2>&1 \
+// RUN: | FileCheck -check-prefixes=L2,RL2,RL2-NB %s
+
+// L2-DAG: [[P0:[0-9]+]]: input, "{{.*}}obj1.o", object
+// RL2-DAG: [[P1:[0-9]+]]: clang-offload-unbundler, {[[P0]]}, object
+// L2-DAG: [[P2:[0-9]+]]: input, "{{.*}}obj2.o", object
+// RL2-DAG: [[P3:[0-9]+]]: clang-offload-unbundler, {[[P2]]}, object
+
+// RL2-DAG: [[P4:[0-9]+]]: linker, {[[P1]], [[P3]]}, image, (device-[[T:hip]], [[ARCH1:gfx803]])
 // RL2-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH1]])" {[[P4]]}, image
 // RL2-DAG: [[P6:[0-9]+]]: linker, {[[P1]], [[P3]]}, image, (device-[[T]], [[ARCH2:gfx900]])
 // RL2-DAG: [[P7:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH2]])" {[[P6]]}, image
-// RL2-DAG: [[P8:[0-9]+]]: linker, {[[P5]], [[P7]]}, object, (device-[[T]])
-// RL2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P8]]}, object
-
-// NL2-DAG: [[P4:[0-9]+]]: linker, {[[P0]], [[P2]]}, image, (host-[[T]])
-// RL2-DAG: [[P4:[0-9]+]]: linker, {[[P1]], [[P3]], [[P9]]}, image, (host-[[T]])
+// RL2-DEV-DAG: [[P8:[0-9]+]]: linker, {[[P5]], [[P7]]}, hip-fatbin, (device-[[T]])
+// RL2-DEV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P8]]}, hip-fatbin
+// RL2-EM-DAG: [[P8:[0-9]+]]: linker, {[[P5]], [[P7]]}, object, (device-[[T]])
+// RL2-EM-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P8]]}, object
+// RL2-NB-NOT: linker
+// RL2-NB-NOT: offload
+
+// NL2-DAG: [[P4:[0-9]+]]: linker, {[[P0]], [[P2]]}, image, (host-[[T:hip]])
+// RL2-EM-DAG: [[P4:[0-9]+]]: linker, {[[P1]], [[P3]], [[P9]]}, image, (host-[[T]])
+// RL2-DEV-NOT: linker
 
 // Test one gpu architectures up to the preprocessor expansion output phase in device-only
 // compilation mode. no bundle.

diff  --git a/clang/test/Driver/hip-toolchain-rdc-separate.hip b/clang/test/Driver/hip-toolchain-rdc-separate.hip
index 698ee14e74dc9..a0cf0493832cd 100644
--- a/clang/test/Driver/hip-toolchain-rdc-separate.hip
+++ b/clang/test/Driver/hip-toolchain-rdc-separate.hip
@@ -88,47 +88,66 @@
 // RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
 // RUN:   -fuse-ld=lld -fgpu-rdc -nogpuinc \
 // RUN:   %T/a.o %T/b.o \
-// RUN: 2>&1 | FileCheck -check-prefix=LINK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=LINK,LINK-HOST-UNBUNDLE,LLD-TMP,LINK-BUNDLE,LINK-EMBED %s
 
-// LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
-// LINK-SAME: "-inputs=[[A_O:.*a.o]]" "-outputs=[[A_OBJ_HOST:.*o]],{{.*o}},{{.*o}}"
-// LINK: "-unbundle" "-allow-missing-bundles"
+// RUN: %clang --hip-link -### -target x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
+// RUN:   -fuse-ld=lld -fgpu-rdc -nogpuinc \
+// RUN:   %T/a.o %T/b.o --cuda-device-only \
+// RUN: 2>&1 | FileCheck -check-prefixes=LINK,LLD-TMP,LINK-BUNDLE,LINK-NOEMBED %s
 
-// LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
-// LINK-SAME: "-inputs=[[B_O:.*b.o]]" "-outputs=[[B_OBJ_HOST:.*o]],{{.*o}},{{.*o}}"
-// LINK: "-unbundle" "-allow-missing-bundles"
+// RUN: %clang --hip-link -### -target x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
+// RUN:   -fuse-ld=lld -fgpu-rdc -nogpuinc \
+// RUN:   %T/a.o %T/b.o --cuda-device-only --no-gpu-bundle-output \
+// RUN: 2>&1 | FileCheck -check-prefixes=LINK,LLD-FIN,LINK-NOBUNDLE,LINK-NOEMBED %s
+
+// LINK-HOST-UNBUNDLE: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// LINK-HOST-UNBUNDLE-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// LINK-HOST-UNBUNDLE-SAME: "-inputs=[[A_O:.*a.o]]" "-outputs=[[A_OBJ_HOST:.*o]],{{.*o}},{{.*o}}"
+// LINK-HOST-UNBUNDLE: "-unbundle" "-allow-missing-bundles"
+
+// LINK-HOST-UNBUNDLE: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// LINK-HOST-UNBUNDLE-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
+// LINK-HOST-UNBUNDLE-SAME: "-inputs=[[B_O:.*b.o]]" "-outputs=[[B_OBJ_HOST:.*o]],{{.*o}},{{.*o}}"
+// LINK-HOST-UNBUNDLE: "-unbundle" "-allow-missing-bundles"
 
 // LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
 // LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
-// LINK-SAME: "-inputs=[[A_O]]" "-outputs={{.*o}},[[A_BC1:.*o]],[[A_BC2:.*o]]"
-// LINK: "-unbundle" "-allow-missing-bundles"
+// LINK-SAME: "-inputs=[[A_O:.*a.o]]" "-outputs={{.*o}},[[A_BC1:.*o]],[[A_BC2:.*o]]"
+// LINK-SAME: "-unbundle" "-allow-missing-bundles"
 
 // LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
 // LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
-// LINK-SAME: "-inputs=[[B_O]]" "-outputs={{.*o}},[[B_BC1:.*o]],[[B_BC2:.*o]]"
-// LINK: "-unbundle" "-allow-missing-bundles"
+// LINK-SAME: "-inputs=[[B_O:.*b.o]]" "-outputs={{.*o}},[[B_BC1:.*o]],[[B_BC2:.*o]]"
+// LINK-SAME: "-unbundle" "-allow-missing-bundles"
 
 // LINK-NOT: "*.llvm-link"
 // LINK-NOT: ".*opt"
 // LINK-NOT: ".*llc"
 // LINK: {{".*lld.*"}} {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
-// LINK: "-plugin-opt=mcpu=gfx803"
-// LINK-SAME: "-o" "[[IMG_DEV1:.*.out]]" "[[A_BC1]]" "[[B_BC1]]"
+// LINK-SAME: "-plugin-opt=mcpu=gfx803"
+// LLD-TMP-SAME: "-o" "[[IMG_DEV1:.*.out]]"
+// LLD-FIN-SAME: "-o" "[[IMG_DEV1:a.out-.*gfx803]]"
+// LINK-SAME "[[A_BC1]]" "[[B_BC1]]"
 
 // LINK-NOT: "*.llvm-link"
 // LINK-NOT: ".*opt"
 // LINK-NOT: ".*llc"
 // LINK: {{".*lld.*"}} {{.*}} "-plugin-opt=-amdgpu-internalize-symbols"
-// LINK: "-plugin-opt=mcpu=gfx900"
-// LINK-SAME: "-o" "[[IMG_DEV2:.*.out]]" "[[A_BC2]]" "[[B_BC2]]"
-
-// LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// LINK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
-// LINK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*hipfb]]"
-
-// LINK: {{".*llvm-mc.*"}} "-o" "[[OBJBUNDLE:.*o]]" "{{.*}}.mcin" "--filetype=obj"
-
-// LINK: [[LD:".*ld.*"]] {{.*}} "-o" "a.out" {{.*}} "[[A_OBJ_HOST]]"
-// LINK-SAME: "[[B_OBJ_HOST]]" "[[OBJBUNDLE]]"
+// LINK-SAME: "-plugin-opt=mcpu=gfx900"
+// LLD-TMP-SAME: "-o" "[[IMG_DEV2:.*.out]]"
+// LLD-FIN-SAME: "-o" "[[IMG_DEV1:a.out-.*gfx900]]"
+// LINK-SAME "[[A_BC2]]" "[[B_BC2]]"
+
+// LINK-BUNDLE: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// LINK-BUNDLE-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
+// LINK-BUNDLE-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*]]"
+// LINK-NOBUNDLE-NOT: {{".*clang-offload-bundler"}} "-type=o"
+
+// LINK-EMBED: {{".*llvm-mc.*"}} "-o" "[[OBJBUNDLE:.*o]]" "{{.*}}.mcin" "--filetype=obj"
+// LINK-NOEMBED-NOT: {{".*llvm-mc.*"}} "-o"
+
+// LINK-EMBED: [[LD:".*ld.*"]] {{.*}} "-o" "a.out" {{.*}} "[[A_OBJ_HOST]]"
+// LINK-EMBED-SAME: "[[B_OBJ_HOST]]" "[[OBJBUNDLE]]"
+// LINK-NOEMBED-NOT: {{".*ld.*"}} {{.*}} "-o" "a.out"