[clang] bec49b1 - [LinkerWrapper] Use `clang` to perform the device linking

Thu Jan 19 10:47:58 PST 2023

Author: Joseph Huber
Date: 2023-01-19T12:47:51-06:00
New Revision: bec49b1d803ced901e14aca154ffc106939100e5

URL: https://github.com/llvm/llvm-project/commit/bec49b1d803ced901e14aca154ffc106939100e5
DIFF: https://github.com/llvm/llvm-project/commit/bec49b1d803ced901e14aca154ffc106939100e5.diff

LOG: [LinkerWrapper] Use `clang` to perform the device linking

Right now in the linker wrapper we manually invoke a lot of the
toolchain programs. This reproduces a lot of logic that is already
handled in clang. Since D140158 we can now target all supported
toolchains directly via cross-compilation.

This patch changes the linker wrapper to consolidate all the alternate
linking and assembler steps into a generic call to `clang` and let clang
handle the argument handling. This heavily simplifies the interface.

Reviewed By: tra, JonChesterfield

Differential Revision: https://reviews.llvm.org/D142133

Added: 
    

Modified: 
    clang/test/Driver/linker-wrapper.c
    clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp

Removed: 
    


################################################################################
diff  --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index b052c077b92cd..d515e316da0f4 100644

--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -12,17 +12,11 @@
 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
 // RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=NVPTX-LINK
-
-// NVPTX-LINK: nvlink{{.*}}-m64 -o {{.*}}.img -arch sm_70 {{.*}}.o {{.*}}.o
-
-// RUN: clang-offload-packager -o %t.out \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70
 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm-bc -o %t.o -fembed-offload-object=%t.out
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=HOST-BC
+// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=NVPTX-LINK
 
-// HOST-BC: nvlink{{.*}}-m64 -o {{.*}}.img -arch sm_70 {{.*}}.o {{.*}}.o
+// NVPTX-LINK: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
@@ -31,16 +25,7 @@
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --device-debug -O0 \
 // RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=NVPTX-LINK-DEBUG
 
-// NVPTX-LINK-DEBUG: nvlink{{.*}}-m64 -g -o {{.*}}.img -arch sm_70 {{.*}}.o {{.*}}.o
-
-// RUN: clang-offload-packager -o %t.out \
-// RUN:   --image=file=%t.nvptx.bc,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.nvptx.bc,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
-// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --device-debug -O2 \
-// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=NVPTX-LINK-DEBUG-LTO
-
-// NVPTX-LINK-DEBUG-LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 -lineinfo {{.*}}.s
+// NVPTX-LINK-DEBUG: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 -Wl,--no-undefined -g {{.*}}.o {{.*}}.o
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \
@@ -49,7 +34,7 @@
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
 // RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=AMDGPU-LINK
 
-// AMDGPU-LINK: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx908 -o {{.*}}.img {{.*}}.o {{.*}}.o
+// AMDGPU-LINK: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.amdgpu.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030 \
@@ -58,17 +43,7 @@
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --save-temps -O2 \
 // RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=AMDGPU-LTO-TEMPS
 
-// AMDGPU-LTO-TEMPS: clang{{.*}}-o [[OBJ:.+]] -fPIC -c --target=amdgcn-amd-amdhsa -O2 -mcpu=gfx1030 {{.*}}.s
-// AMDGPU-LTO-TEMPS: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx1030 -o {{.*}}.img {{.*}}.o
-
-// RUN: clang-offload-packager -o %t.out \
-// RUN:   --image=file=%t.amdgpu.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \
-// RUN:   --image=file=%t.amdgpu.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
-// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=AMDGPU-LINK-LTO
-
-// AMDGPU-LINK-LTO: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx908 -o {{.*}}.img {{.*}}.o
+// AMDGPU-LTO-TEMPS: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -O2 -Wl,--no-undefined -save-temps {{.*}}.s
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu \
@@ -77,7 +52,7 @@
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
 // RUN:   --linker-path=/usr/bin/ld.lld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CPU-LINK
 
-// CPU-LINK: ld.lld{{.*}}-m elf_x86_64 -shared -Bsymbolic -o {{.*}}.img {{.*}}.o {{.*}}.o
+// CPU-LINK: clang{{.*}} -o {{.*}}.img --target=x86_64-unknown-linux-gnu -march=native -O2 -Wl,--no-undefined -Bsymbolic -shared {{.*}}.o {{.*}}.o
 
 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o
 // RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu -mllvm -openmp-opt-disable \
@@ -86,25 +61,6 @@
 // HOST-LINK: ld.lld{{.*}}-a -b -c {{.*}}.o -o a.out
 // HOST-LINK-NOT: ld.lld{{.*}}-abc
 
-// RUN: clang-offload-packager -o %t.out \
-// RUN:   --image=file=%t.nvptx.bc,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.nvptx.bc,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
-// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=LTO
-
-// LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 {{.*}}.s
-// LTO-NOT: nvlink
-
-// RUN: clang-offload-packager -o %t.out \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_70
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
-// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CUDA-OMP-LINK
-
-// CUDA-OMP-LINK: nvlink{{.*}}-m64 -o {{.*}}.img -arch sm_70 {{.*}}.o {{.*}}.o
-
 // RUN: clang-offload-packager -o %t-lib.out \
 // RUN:   --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
 // RUN:   --image=file=%t.elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_52
@@ -116,8 +72,8 @@
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
 // RUN:   --linker-path=/usr/bin/ld -- %t.a %t-obj.o -o a.out 2>&1 | FileCheck %s --check-prefix=STATIC-LIBRARY
 
-// STATIC-LIBRARY: nvlink{{.*}} -arch sm_70
-// STATIC-LIBRARY-NOT: nvlink{{.*}} -arch sm_50
+// STATIC-LIBRARY: clang{{.*}} -march=sm_70
+// STATIC-LIBRARY-NOT: clang{{.*}} -march=sm_50
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_70 \
@@ -128,9 +84,9 @@
 // RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu \
 // RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CUDA
 
-// CUDA: nvlink{{.*}}-m64 -o {{.*}}.img -arch sm_52 {{.*}}.o
-// CUDA: nvlink{{.*}}-m64 -o {{.*}}.img -arch sm_70 {{.*}}.o {{.*}}.o
-// CUDA: fatbinary{{.*}}-64 --create {{.*}}.fatbin --image=profile=sm_70,file={{.*}}.img --image=profile=sm_52,file={{.*}}.img 
+// CUDA: clang{{.*}} -o [[IMG_SM52:.+]] --target=nvptx64-nvidia-cuda -march=sm_52
+// CUDA: clang{{.*}} -o [[IMG_SM70:.+]] --target=nvptx64-nvidia-cuda -march=sm_70
+// CUDA: fatbinary{{.*}}-64 --create {{.*}}.fatbin --image=profile=sm_70,file=[[IMG_SM70]] --image=profile=sm_52,file=[[IMG_SM52]] 
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_80 \
@@ -153,9 +109,9 @@
 // RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu \
 // RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=HIP
 
-// HIP: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx908 -o {{.*}}.img {{.*}}.o
-// HIP: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx90a -o {{.*}}.img {{.*}}.o
-// HIP: clang-offload-bundler{{.*}}-type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx908 -input=/dev/null -input={{.*}}.img -input={{.*}}.img -output={{.*}}.hipfb
+// HIP: clang{{.*}} -o [[IMG_GFX908:.+]] --target=amdgcn-amd-amdhsa -mcpu=gfx908
+// HIP: clang{{.*}} -o [[IMG_GFX90A:.+]] --target=amdgcn-amd-amdhsa -mcpu=gfx90a
+// HIP: clang-offload-bundler{{.*}}-type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx908 -input=/dev/null -input=[[IMG_GFX90A]] -input=[[IMG_GFX908]] -output={{.*}}.hipfb
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \
@@ -166,8 +122,8 @@
 // RUN:   --linker-path=/usr/bin/ld --device-linker=a --device-linker=nvptx64-nvidia-cuda=b -- \
 // RUN:   %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=LINKER-ARGS
 
-// LINKER-ARGS: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx908 -o {{.*}}.img {{.*}}.o a
-// LINKER-ARGS: nvlink{{.*}}-m64 -o {{.*}}.img -arch sm_70 {{.*}}.o a b
+// LINKER-ARGS: clang{{.*}}--target=amdgcn-amd-amdhsa{{.*}}-Wl,a
+// LINKER-ARGS: clang{{.*}}--target=nvptx64-nvidia-cuda{{.*}}-Wl,a -Wl,b
 
 // RUN: not clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu -ldummy \
 // RUN:   --linker-path=/usr/bin/ld --device-linker=a --device-linker=nvptx64-nvidia-cuda=b -- \

diff  --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 1daa58f20fd5b..5cae73ef543eb 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -271,99 +271,13 @@ void printVersion(raw_ostream &OS) {
 }
 
 namespace nvptx {
-Expected<StringRef> assemble(StringRef InputFile, const ArgList &Args,
-                             bool RDC = true) {
-  llvm::TimeTraceScope TimeScope("NVPTX Assembler");
-  // NVPTX uses the ptxas binary to create device object files.
-  Expected<std::string> PtxasPath = findProgram("ptxas", {CudaBinaryPath});
-  if (!PtxasPath)
-    return PtxasPath.takeError();
-
-  const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
-  StringRef Arch = Args.getLastArgValue(OPT_arch_EQ);
-  // Create a new file to write the linked device image to. Assume that the
-  // input filename already has the device and architecture.
-  auto TempFileOrErr = createOutputFile(sys::path::stem(InputFile), "cubin");
-  if (!TempFileOrErr)
-    return TempFileOrErr.takeError();
-
-  SmallVector<StringRef, 16> CmdArgs;
-  StringRef OptLevel = Args.getLastArgValue(OPT_opt_level, "O2");
-  CmdArgs.push_back(*PtxasPath);
-  CmdArgs.push_back(Triple.isArch64Bit() ? "-m64" : "-m32");
-  if (Verbose)
-    CmdArgs.push_back("-v");
-  for (StringRef Arg : Args.getAllArgValues(OPT_ptxas_arg))
-    CmdArgs.push_back(Args.MakeArgString(Arg));
-  CmdArgs.push_back("-o");
-  CmdArgs.push_back(*TempFileOrErr);
-  CmdArgs.push_back(Args.MakeArgString("-" + OptLevel));
-  CmdArgs.push_back("--gpu-name");
-  CmdArgs.push_back(Arch);
-  if (Args.hasArg(OPT_debug) && OptLevel[1] == '0')
-    CmdArgs.push_back("-g");
-  else if (Args.hasArg(OPT_debug))
-    CmdArgs.push_back("-lineinfo");
-  if (RDC)
-    CmdArgs.push_back("-c");
-
-  CmdArgs.push_back(InputFile);
-
-  if (Error Err = executeCommands(*PtxasPath, CmdArgs))
-    return std::move(Err);
-
-  return *TempFileOrErr;
-}
-
-Expected<StringRef> link(ArrayRef<StringRef> InputFiles, const ArgList &Args) {
-  llvm::TimeTraceScope TimeScope("NVPTX linker");
-  // NVPTX uses the nvlink binary to link device object files.
-  Expected<std::string> NvlinkPath = findProgram("nvlink", {CudaBinaryPath});
-  if (!NvlinkPath)
-    return NvlinkPath.takeError();
-
-  const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
-  StringRef Arch = Args.getLastArgValue(OPT_arch_EQ);
-
-  // Create a new file to write the linked device image to.
-  auto TempFileOrErr =
-      createOutputFile(sys::path::filename(ExecutableName) + "." +
-                           Triple.getArchName() + "." + Arch,
-                       "img");
-  if (!TempFileOrErr)
-    return TempFileOrErr.takeError();
-
-  SmallVector<StringRef, 16> CmdArgs;
-  CmdArgs.push_back(*NvlinkPath);
-  CmdArgs.push_back(Triple.isArch64Bit() ? "-m64" : "-m32");
-  if (Args.hasArg(OPT_debug))
-    CmdArgs.push_back("-g");
-  if (Verbose)
-    CmdArgs.push_back("-v");
-  CmdArgs.push_back("-o");
-  CmdArgs.push_back(*TempFileOrErr);
-  CmdArgs.push_back("-arch");
-  CmdArgs.push_back(Arch);
-
-  // Add extracted input files.
-  for (StringRef Input : InputFiles)
-    CmdArgs.push_back(Input);
-
-  for (StringRef Arg : Args.getAllArgValues(OPT_linker_arg_EQ))
-    CmdArgs.push_back(Args.MakeArgString(Arg));
-  if (Error Err = executeCommands(*NvlinkPath, CmdArgs))
-    return std::move(Err);
-
-  return *TempFileOrErr;
-}
-
 Expected<StringRef>
 fatbinary(ArrayRef<std::pair<StringRef, StringRef>> InputFiles,
           const ArgList &Args) {
   llvm::TimeTraceScope TimeScope("NVPTX fatbinary");
   // NVPTX uses the fatbinary program to bundle the linked images.
   Expected<std::string> FatBinaryPath =
-      findProgram("fatbinary", {CudaBinaryPath});
+      findProgram("fatbinary", {CudaBinaryPath + "/bin"});
   if (!FatBinaryPath)
     return FatBinaryPath.takeError();
 
@@ -393,49 +307,6 @@ fatbinary(ArrayRef<std::pair<StringRef, StringRef>> InputFiles,
 } // namespace nvptx
 
 namespace amdgcn {
-Expected<StringRef> link(ArrayRef<StringRef> InputFiles, const ArgList &Args) {
-  llvm::TimeTraceScope TimeScope("AMDGPU linker");
-  // AMDGPU uses lld to link device object files.
-  Expected<std::string> LLDPath =
-      findProgram("lld", {getMainExecutable("lld")});
-  if (!LLDPath)
-    return LLDPath.takeError();
-
-  const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
-  StringRef Arch = Args.getLastArgValue(OPT_arch_EQ);
-
-  // Create a new file to write the linked device image to.
-  auto TempFileOrErr =
-      createOutputFile(sys::path::filename(ExecutableName) + "." +
-                           Triple.getArchName() + "." + Arch,
-                       "img");
-  if (!TempFileOrErr)
-    return TempFileOrErr.takeError();
-  std::string ArchArg = ("-plugin-opt=mcpu=" + Arch).str();
-
-  SmallVector<StringRef, 16> CmdArgs;
-  CmdArgs.push_back(*LLDPath);
-  CmdArgs.push_back("-flavor");
-  CmdArgs.push_back("gnu");
-  CmdArgs.push_back("--no-undefined");
-  CmdArgs.push_back("-shared");
-  CmdArgs.push_back("-plugin-opt=-amdgpu-internalize-symbols");
-  CmdArgs.push_back(ArchArg);
-  CmdArgs.push_back("-o");
-  CmdArgs.push_back(*TempFileOrErr);
-
-  // Add extracted input files.
-  for (StringRef Input : InputFiles)
-    CmdArgs.push_back(Input);
-
-  for (StringRef Arg : Args.getAllArgValues(OPT_linker_arg_EQ))
-    CmdArgs.push_back(Args.MakeArgString(Arg));
-  if (Error Err = executeCommands(*LLDPath, CmdArgs))
-    return std::move(Err);
-
-  return *TempFileOrErr;
-}
-
 Expected<StringRef>
 fatbinary(ArrayRef<std::pair<StringRef, StringRef>> InputFiles,
           const ArgList &Args) {
@@ -483,35 +354,9 @@ fatbinary(ArrayRef<std::pair<StringRef, StringRef>> InputFiles,
 } // namespace amdgcn
 
 namespace generic {
-
-const char *getLDMOption(const llvm::Triple &T) {
-  switch (T.getArch()) {
-  case llvm::Triple::x86:
-    if (T.isOSIAMCU())
-      return "elf_iamcu";
-    return "elf_i386";
-  case llvm::Triple::aarch64:
-    return "aarch64linux";
-  case llvm::Triple::aarch64_be:
-    return "aarch64linuxb";
-  case llvm::Triple::ppc64:
-    return "elf64ppc";
-  case llvm::Triple::ppc64le:
-    return "elf64lppc";
-  case llvm::Triple::x86_64:
-    if (T.isX32())
-      return "elf32_x86_64";
-    return "elf_x86_64";
-  case llvm::Triple::ve:
-    return "elf64ve";
-  default:
-    return nullptr;
-  }
-}
-
-Expected<StringRef> assemble(StringRef InputFile, const ArgList &Args) {
-  llvm::TimeTraceScope TimeScope("Clang Assembler");
-  // Use `clang` to invoke the generic assembler.
+Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args) {
+  llvm::TimeTraceScope TimeScope("Clang");
+  // Use `clang` to invoke the appropriate device tools.
   Expected<std::string> ClangPath =
       findProgram("clang", {getMainExecutable("clang")});
   if (!ClangPath)
@@ -519,9 +364,14 @@ Expected<StringRef> assemble(StringRef InputFile, const ArgList &Args) {
 
   const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
   StringRef Arch = Args.getLastArgValue(OPT_arch_EQ);
+  if (Arch.empty())
+    Arch = "native";
   // Create a new file to write the linked device image to. Assume that the
   // input filename already has the device and architecture.
-  auto TempFileOrErr = createOutputFile(sys::path::stem(InputFile), "o");
+  auto TempFileOrErr =
+      createOutputFile(sys::path::filename(ExecutableName) + "." +
+                           Triple.getArchName() + "." + Arch,
+                       "img");
   if (!TempFileOrErr)
     return TempFileOrErr.takeError();
 
@@ -530,65 +380,47 @@ Expected<StringRef> assemble(StringRef InputFile, const ArgList &Args) {
       *ClangPath,
       "-o",
       *TempFileOrErr,
-      "-fPIC",
-      "-c",
       Args.MakeArgString("--target=" + Triple.getTriple()),
-      Args.MakeArgString("-" + OptLevel),
       Triple.isAMDGPU() ? Args.MakeArgString("-mcpu=" + Arch)
                         : Args.MakeArgString("-march=" + Arch),
-      InputFile,
+      Args.MakeArgString("-" + OptLevel),
+      "-Wl,--no-undefined",
   };
 
-  if (Error Err = executeCommands(*ClangPath, CmdArgs))
-    return std::move(Err);
+  // If this is CPU offloading we copy the input libraries.
+  if (!Triple.isAMDGPU() && !Triple.isNVPTX()) {
+    CmdArgs.push_back("-Bsymbolic");
+    CmdArgs.push_back("-shared");
+    ArgStringList LinkerArgs;
+    for (const opt::Arg *Arg :
+         Args.filtered(OPT_library, OPT_rpath, OPT_library_path))
+      Arg->render(Args, LinkerArgs);
+    llvm::copy(LinkerArgs, std::back_inserter(CmdArgs));
+  }
 
-  return *TempFileOrErr;
-}
+  if (Args.hasArg(OPT_debug))
+    CmdArgs.push_back("-g");
 
-Expected<StringRef> link(ArrayRef<StringRef> InputFiles, const ArgList &Args) {
-  llvm::TimeTraceScope TimeScope("Generic linker");
-  const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
-  StringRef Arch = Args.getLastArgValue(OPT_arch_EQ);
+  if (SaveTemps)
+    CmdArgs.push_back("-save-temps");
 
-  // Create a new file to write the linked device image to.
-  auto TempFileOrErr =
-      createOutputFile(sys::path::filename(ExecutableName) + "." +
-                           Triple.getArchName() + "." + Arch,
-                       "img");
-  if (!TempFileOrErr)
-    return TempFileOrErr.takeError();
+  if (Verbose)
+    CmdArgs.push_back("-v");
 
-  // Use the host linker to perform generic offloading. Use the same libraries
-  // and paths as the host application does.
-  SmallVector<StringRef, 16> CmdArgs;
-  CmdArgs.push_back(Args.getLastArgValue(OPT_linker_path_EQ));
-  CmdArgs.push_back("-m");
-  CmdArgs.push_back(getLDMOption(Triple));
-  CmdArgs.push_back("-shared");
+  if (!CudaBinaryPath.empty())
+    CmdArgs.push_back(Args.MakeArgString("--cuda-path=" + CudaBinaryPath));
 
-  ArgStringList LinkerArgs;
-  for (const opt::Arg *Arg : Args) {
-    auto Op = Arg->getOption();
-    if (Op.matches(OPT_library) || Op.matches(OPT_library_path) ||
-        Op.matches(OPT_as_needed) || Op.matches(OPT_no_as_needed) ||
-        Op.matches(OPT_rpath) || Op.matches(OPT_dynamic_linker))
-      Arg->render(Args, LinkerArgs);
-  }
-  for (StringRef Arg : LinkerArgs)
-    CmdArgs.push_back(Arg);
+  for (StringRef Arg : Args.getAllArgValues(OPT_ptxas_arg))
+    llvm::copy(SmallVector<StringRef>({"-Xcuda-ptxas", Arg}),
+               std::back_inserter(CmdArgs));
 
-  CmdArgs.push_back("-Bsymbolic");
-  CmdArgs.push_back("-o");
-  CmdArgs.push_back(*TempFileOrErr);
+  for (StringRef Arg : Args.getAllArgValues(OPT_linker_arg_EQ))
+    CmdArgs.push_back(Args.MakeArgString("-Wl," + Arg));
 
-  // Add extracted input files.
-  for (StringRef Input : InputFiles)
-    CmdArgs.push_back(Input);
+  for (StringRef InputFile : InputFiles)
+    CmdArgs.push_back(InputFile);
 
-  for (StringRef Arg : Args.getAllArgValues(OPT_linker_arg_EQ))
-    CmdArgs.push_back(Args.MakeArgString(Arg));
-  if (Error Err =
-          executeCommands(Args.getLastArgValue(OPT_linker_path_EQ), CmdArgs))
+  if (Error Err = executeCommands(*ClangPath, CmdArgs))
     return std::move(Err);
 
   return *TempFileOrErr;
@@ -601,16 +433,14 @@ Expected<StringRef> linkDevice(ArrayRef<StringRef> InputFiles,
   switch (Triple.getArch()) {
   case Triple::nvptx:
   case Triple::nvptx64:
-    return nvptx::link(InputFiles, Args);
   case Triple::amdgcn:
-    return amdgcn::link(InputFiles, Args);
   case Triple::x86:
   case Triple::x86_64:
   case Triple::aarch64:
   case Triple::aarch64_be:
   case Triple::ppc64:
   case Triple::ppc64le:
-    return generic::link(InputFiles, Args);
+    return generic::clang(InputFiles, Args);
   default:
     return createStringError(inconvertibleErrorCode(),
                              Triple.getArchName() +
@@ -933,19 +763,6 @@ Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
     return Error::success();
   }
 
-  // Is we are compiling for NVPTX we need to run the assembler first.
-  if (Triple.isNVPTX() || SaveTemps) {
-    for (StringRef &File : Files) {
-
-      auto FileOrErr = Triple.isNVPTX()
-                           ? nvptx::assemble(File, Args, !SingleOutput)
-                           : generic::assemble(File, Args);
-      if (!FileOrErr)
-        return FileOrErr.takeError();
-      File = *FileOrErr;
-    }
-  }
-
   // Append the new inputs to the device linker input.
   for (StringRef File : Files)
     OutputFiles.push_back(File);
@@ -1226,12 +1043,9 @@ linkAndWrapDeviceFiles(SmallVectorImpl<OffloadFile> &LinkerInputFiles,
     }
 
     // Link the remaining device files using the device linker.
-    llvm::Triple Triple(LinkerArgs.getLastArgValue(OPT_triple_EQ));
-    bool RequiresLinking =
-        !Args.hasArg(OPT_embed_bitcode) &&
-        !(Input.empty() && InputFiles.size() == 1 && Triple.isNVPTX());
-    auto OutputOrErr = RequiresLinking ? linkDevice(InputFiles, LinkerArgs)
-                                       : InputFiles.front();
+    auto OutputOrErr = !Args.hasArg(OPT_embed_bitcode)
+                           ? linkDevice(InputFiles, LinkerArgs)
+                           : InputFiles.front();
     if (!OutputOrErr)
       return OutputOrErr.takeError();
 
@@ -1457,8 +1271,6 @@ int main(int Argc, char **Argv) {
   SaveTemps = Args.hasArg(OPT_save_temps);
   ExecutableName = Args.getLastArgValue(OPT_o, "a.out");
   CudaBinaryPath = Args.getLastArgValue(OPT_cuda_path_EQ).str();
-  if (!CudaBinaryPath.empty())
-    CudaBinaryPath = CudaBinaryPath + "/bin";
 
   parallel::strategy = hardware_concurrency(1);
   if (auto *Arg = Args.getLastArg(OPT_wrapper_jobs)) {