[clang] 4c41170 - [Clang][OpenMP] Add partial support for Static Device Libraries

Wed Oct 6 21:46:18 PDT 2021

Author: Saiyedul Islam
Date: 2021-10-07T04:45:19Z
New Revision: 4c4117089599cb5b6c6fa5635c28462ffd1bddf4

URL: https://github.com/llvm/llvm-project/commit/4c4117089599cb5b6c6fa5635c28462ffd1bddf4
DIFF: https://github.com/llvm/llvm-project/commit/4c4117089599cb5b6c6fa5635c28462ffd1bddf4.diff

LOG: [Clang][OpenMP] Add partial support for Static Device Libraries

An archive containing device code object files can be passed to
clang command line for linking. For each given offload target
it creates a device specific archives which is either passed to llvm-link
if the target is amdgpu, or to clang-nvlink-wrapper if the target is
nvptx. -L/-l flags are used to specify these fat archives on the command
line. E.g.
  clang++ -fopenmp -fopenmp-targets=nvptx64 main.cpp -L. -lmylib

It currently doesn't support linking an archive directly, like:
  clang++ -fopenmp -fopenmp-targets=nvptx64 main.cpp libmylib.a

Linking with x86 offload also does not work.

Reviewed By: ye-luo

Differential Revision: https://reviews.llvm.org/D105191

Added: 
    clang/test/Driver/Inputs/openmp_static_device_link/libFatArchive.a
    clang/test/Driver/fat_archive_amdgpu.cpp
    clang/test/Driver/fat_archive_nvptx.cpp

Modified: 
    clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
    clang/lib/Driver/ToolChains/Clang.cpp
    clang/lib/Driver/ToolChains/CommonArgs.cpp
    clang/lib/Driver/ToolChains/CommonArgs.h
    clang/lib/Driver/ToolChains/Cuda.cpp
    clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp

Removed: 
    


################################################################################
diff  --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
index 135e3694434db..5400e26177291 100644

--- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
@@ -114,6 +114,10 @@ const char *AMDGCN::OpenMPLinker::constructLLVMLinkCommand(
     }
   }
 
+  AddStaticDeviceLibsLinking(C, *this, JA, Inputs, Args, CmdArgs, "amdgcn",
+                      SubArchName,
+                      /* bitcode SDL?*/ true,
+                      /* PostClang Link? */ false);
   // Add an intermediate output file.
   CmdArgs.push_back("-o");
   const char *OutputFileName =

diff  --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 369c12aea5231..65dfe0ae0221d 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7734,12 +7734,28 @@ void OffloadBundler::ConstructJob(Compilation &C, const JobAction &JA,
     Triples += Action::GetOffloadKindName(CurKind);
     Triples += '-';
     Triples += CurTC->getTriple().normalize();
-    if ((CurKind == Action::OFK_HIP || CurKind == Action::OFK_OpenMP ||
-         CurKind == Action::OFK_Cuda) &&
+    if ((CurKind == Action::OFK_HIP || CurKind == Action::OFK_Cuda) &&
         CurDep->getOffloadingArch()) {
       Triples += '-';
       Triples += CurDep->getOffloadingArch();
     }
+
+    // TODO: Replace parsing of -march flag. Can be done by storing GPUArch
+    //       with each toolchain.
+    StringRef GPUArchName;
+    if (CurKind == Action::OFK_OpenMP) {
+      // Extract GPUArch from -march argument in TC argument list.
+      for (unsigned ArgIndex = 0; ArgIndex < TCArgs.size(); ArgIndex++) {
+        auto ArchStr = StringRef(TCArgs.getArgString(ArgIndex));
+        auto Arch = ArchStr.startswith_insensitive("-march=");
+        if (Arch) {
+          GPUArchName = ArchStr.substr(7);
+          Triples += "-";
+          break;
+        }
+      }
+      Triples += GPUArchName.str();
+    }
   }
   CmdArgs.push_back(TCArgs.MakeArgString(Triples));
 
@@ -7813,12 +7829,27 @@ void OffloadBundler::ConstructJobMultipleOutputs(
     Triples += '-';
     Triples += Dep.DependentToolChain->getTriple().normalize();
     if ((Dep.DependentOffloadKind == Action::OFK_HIP ||
-         Dep.DependentOffloadKind == Action::OFK_OpenMP ||
          Dep.DependentOffloadKind == Action::OFK_Cuda) &&
         !Dep.DependentBoundArch.empty()) {
       Triples += '-';
       Triples += Dep.DependentBoundArch;
     }
+    // TODO: Replace parsing of -march flag. Can be done by storing GPUArch
+    //       with each toolchain.
+    StringRef GPUArchName;
+    if (Dep.DependentOffloadKind == Action::OFK_OpenMP) {
+      // Extract GPUArch from -march argument in TC argument list.
+      for (uint ArgIndex = 0; ArgIndex < TCArgs.size(); ArgIndex++) {
+        StringRef ArchStr = StringRef(TCArgs.getArgString(ArgIndex));
+        auto Arch = ArchStr.startswith_insensitive("-march=");
+        if (Arch) {
+          GPUArchName = ArchStr.substr(7);
+          Triples += "-";
+          break;
+        }
+      }
+      Triples += GPUArchName.str();
+    }
   }
 
   CmdArgs.push_back(TCArgs.MakeArgString(Triples));

diff  --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 9f1895466c98d..c3abdf446cfaf 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -34,6 +34,7 @@
 #include "clang/Driver/Util.h"
 #include "clang/Driver/XRayArgs.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -1587,6 +1588,292 @@ void tools::addX86AlignBranchArgs(const Driver &D, const ArgList &Args,
   }
 }
 
+/// SDLSearch: Search for Static Device Library
+/// The search for SDL bitcode files is consistent with how static host
+/// libraries are discovered. That is, the -l option triggers a search for
+/// files in a set of directories called the LINKPATH. The host library search
+/// procedure looks for a specific filename in the LINKPATH.  The filename for
+/// a host library is lib<libname>.a or lib<libname>.so. For SDLs, there is an
+/// ordered-set of filenames that are searched. We call this ordered-set of
+/// filenames as SEARCH-ORDER. Since an SDL can either be device-type specific,
+/// architecture specific, or generic across all architectures, a naming
+/// convention and search order is used where the file name embeds the
+/// architecture name <arch-name> (nvptx or amdgcn) and the GPU device type
+/// <device-name> such as sm_30 and gfx906. <device-name> is absent in case of
+/// device-independent SDLs. To reduce congestion in host library directories,
+/// the search first looks for files in the “libdevice” subdirectory. SDLs that
+/// are bc files begin with the prefix “lib”.
+///
+/// Machine-code SDLs can also be managed as an archive (*.a file). The
+/// convention has been to use the prefix “lib”. To avoid confusion with host
+/// archive libraries, we use prefix "libbc-" for the bitcode SDL archives.
+///
+bool tools::SDLSearch(const Driver &D, const llvm::opt::ArgList &DriverArgs,
+                      llvm::opt::ArgStringList &CC1Args,
+                      SmallVector<std::string, 8> LibraryPaths, std::string Lib,
+                      StringRef Arch, StringRef Target, bool isBitCodeSDL,
+                      bool postClangLink) {
+  SmallVector<std::string, 12> SDLs;
+
+  std::string LibDeviceLoc = "/libdevice";
+  std::string LibBcPrefix = "/libbc-";
+  std::string LibPrefix = "/lib";
+
+  if (isBitCodeSDL) {
+    // SEARCH-ORDER for Bitcode SDLs:
+    //       libdevice/libbc-<libname>-<arch-name>-<device-type>.a
+    //       libbc-<libname>-<arch-name>-<device-type>.a
+    //       libdevice/libbc-<libname>-<arch-name>.a
+    //       libbc-<libname>-<arch-name>.a
+    //       libdevice/libbc-<libname>.a
+    //       libbc-<libname>.a
+    //       libdevice/lib<libname>-<arch-name>-<device-type>.bc
+    //       lib<libname>-<arch-name>-<device-type>.bc
+    //       libdevice/lib<libname>-<arch-name>.bc
+    //       lib<libname>-<arch-name>.bc
+    //       libdevice/lib<libname>.bc
+    //       lib<libname>.bc
+
+    for (StringRef Base : {LibBcPrefix, LibPrefix}) {
+      const auto *Ext = Base.contains(LibBcPrefix) ? ".a" : ".bc";
+
+      for (auto Suffix : {Twine(Lib + "-" + Arch + "-" + Target).str(),
+                          Twine(Lib + "-" + Arch).str(), Twine(Lib).str()}) {
+        SDLs.push_back(Twine(LibDeviceLoc + Base + Suffix + Ext).str());
+        SDLs.push_back(Twine(Base + Suffix + Ext).str());
+      }
+    }
+  } else {
+    // SEARCH-ORDER for Machine-code SDLs:
+    //    libdevice/lib<libname>-<arch-name>-<device-type>.a
+    //    lib<libname>-<arch-name>-<device-type>.a
+    //    libdevice/lib<libname>-<arch-name>.a
+    //    lib<libname>-<arch-name>.a
+
+    const auto *Ext = ".a";
+
+    for (auto Suffix : {Twine(Lib + "-" + Arch + "-" + Target).str(),
+                        Twine(Lib + "-" + Arch).str()}) {
+      SDLs.push_back(Twine(LibDeviceLoc + LibPrefix + Suffix + Ext).str());
+      SDLs.push_back(Twine(LibPrefix + Suffix + Ext).str());
+    }
+  }
+
+  // The CUDA toolchain does not use a global device llvm-link before the LLVM
+  // backend generates ptx. So currently, the use of bitcode SDL for nvptx is
+  // only possible with post-clang-cc1 linking. Clang cc1 has a feature that
+  // will link libraries after clang compilation while the LLVM IR is still in
+  // memory. This utilizes a clang cc1 option called “-mlink-builtin-bitcode”.
+  // This is a clang -cc1 option that is generated by the clang driver. The
+  // option value must a full path to an existing file.
+  bool FoundSDL = false;
+  for (auto LPath : LibraryPaths) {
+    for (auto SDL : SDLs) {
+      auto FullName = Twine(LPath + SDL).str();
+      if (llvm::sys::fs::exists(FullName)) {
+        if (postClangLink)
+          CC1Args.push_back("-mlink-builtin-bitcode");
+        CC1Args.push_back(DriverArgs.MakeArgString(FullName));
+        FoundSDL = true;
+        break;
+      }
+    }
+    if (FoundSDL)
+      break;
+  }
+  return FoundSDL;
+}
+
+/// Search if a user provided archive file lib<libname>.a exists in any of
+/// the library paths. If so, add a new command to clang-offload-bundler to
+/// unbundle this archive and create a temporary device specific archive. Name
+/// of this SDL is passed to the llvm-link (for amdgcn) or to the
+/// clang-nvlink-wrapper (for nvptx) commands by the driver.
+bool tools::GetSDLFromOffloadArchive(
+    Compilation &C, const Driver &D, const Tool &T, const JobAction &JA,
+    const InputInfoList &Inputs, const llvm::opt::ArgList &DriverArgs,
+    llvm::opt::ArgStringList &CC1Args, SmallVector<std::string, 8> LibraryPaths,
+    StringRef Lib, StringRef Arch, StringRef Target, bool isBitCodeSDL,
+    bool postClangLink) {
+
+  // We don't support bitcode archive bundles for nvptx
+  if (isBitCodeSDL && Arch.contains("nvptx"))
+    return false;
+
+  bool FoundAOB = false;
+  SmallVector<std::string, 2> AOBFileNames;
+  std::string ArchiveOfBundles;
+  for (auto LPath : LibraryPaths) {
+    ArchiveOfBundles.clear();
+
+    AOBFileNames.push_back(Twine(LPath + "/libdevice/lib" + Lib + ".a").str());
+    AOBFileNames.push_back(Twine(LPath + "/lib" + Lib + ".a").str());
+
+    for (auto AOB : AOBFileNames) {
+      if (llvm::sys::fs::exists(AOB)) {
+        ArchiveOfBundles = AOB;
+        FoundAOB = true;
+        break;
+      }
+    }
+
+    if (!FoundAOB)
+      continue;
+
+    StringRef Prefix = isBitCodeSDL ? "libbc-" : "lib";
+    std::string OutputLib = D.GetTemporaryPath(
+        Twine(Prefix + Lib + "-" + Arch + "-" + Target).str(), "a");
+
+    C.addTempFile(C.getArgs().MakeArgString(OutputLib.c_str()));
+
+    ArgStringList CmdArgs;
+    SmallString<128> DeviceTriple;
+    DeviceTriple += Action::GetOffloadKindName(JA.getOffloadingDeviceKind());
+    DeviceTriple += '-';
+    std::string NormalizedTriple = T.getToolChain().getTriple().normalize();
+    DeviceTriple += NormalizedTriple;
+    if (!Target.empty()) {
+      DeviceTriple += '-';
+      DeviceTriple += Target;
+    }
+
+    std::string UnbundleArg("-unbundle");
+    std::string TypeArg("-type=a");
+    std::string InputArg("-inputs=" + ArchiveOfBundles);
+    std::string OffloadArg("-targets=" + std::string(DeviceTriple));
+    std::string OutputArg("-outputs=" + OutputLib);
+
+    const char *UBProgram = DriverArgs.MakeArgString(
+        T.getToolChain().GetProgramPath("clang-offload-bundler"));
+
+    ArgStringList UBArgs;
+    UBArgs.push_back(C.getArgs().MakeArgString(UnbundleArg.c_str()));
+    UBArgs.push_back(C.getArgs().MakeArgString(TypeArg.c_str()));
+    UBArgs.push_back(C.getArgs().MakeArgString(InputArg.c_str()));
+    UBArgs.push_back(C.getArgs().MakeArgString(OffloadArg.c_str()));
+    UBArgs.push_back(C.getArgs().MakeArgString(OutputArg.c_str()));
+
+    // Add this flag to not exit from clang-offload-bundler if no compatible
+    // code object is found in heterogenous archive library.
+    std::string AdditionalArgs("-allow-missing-bundles");
+    UBArgs.push_back(C.getArgs().MakeArgString(AdditionalArgs.c_str()));
+
+    C.addCommand(std::make_unique<Command>(
+        JA, T, ResponseFileSupport::AtFileCurCP(), UBProgram, UBArgs, Inputs,
+        InputInfo(&JA, C.getArgs().MakeArgString(OutputLib.c_str()))));
+    if (postClangLink)
+      CC1Args.push_back("-mlink-builtin-bitcode");
+
+    CC1Args.push_back(DriverArgs.MakeArgString(OutputLib));
+    break;
+  }
+
+  return FoundAOB;
+}
+
+// Wrapper function used by driver for adding SDLs during link phase.
+void tools::AddStaticDeviceLibsLinking(Compilation &C, const Tool &T,
+                                const JobAction &JA,
+                                const InputInfoList &Inputs,
+                                const llvm::opt::ArgList &DriverArgs,
+                                llvm::opt::ArgStringList &CC1Args,
+                                StringRef Arch, StringRef Target,
+                                bool isBitCodeSDL, bool postClangLink) {
+  AddStaticDeviceLibs(&C, &T, &JA, &Inputs, C.getDriver(), DriverArgs, CC1Args,
+                      Arch, Target, isBitCodeSDL, postClangLink);
+}
+
+// Wrapper function used for post clang linking of bitcode SDLS for nvptx by
+// the CUDA toolchain.
+void tools::AddStaticDeviceLibsPostLinking(const Driver &D,
+                                const llvm::opt::ArgList &DriverArgs,
+                                llvm::opt::ArgStringList &CC1Args,
+                                StringRef Arch, StringRef Target,
+                                bool isBitCodeSDL, bool postClangLink) {
+  AddStaticDeviceLibs(nullptr, nullptr, nullptr, nullptr, D, DriverArgs,
+                      CC1Args, Arch, Target, isBitCodeSDL, postClangLink);
+}
+
+// User defined Static Device Libraries(SDLs) can be passed to clang for
+// offloading GPU compilers. Like static host libraries, the use of a SDL is
+// specified with the -l command line option. The primary 
diff erence between
+// host and SDLs is the filenames for SDLs (refer SEARCH-ORDER for Bitcode SDLs
+// and SEARCH-ORDER for Machine-code SDLs for the naming convention).
+// SDLs are of following types:
+//
+// * Bitcode SDLs: They can either be a *.bc file or an archive of *.bc files.
+//           For NVPTX, these libraries are post-clang linked following each
+//           compilation. For AMDGPU, these libraries are linked one time
+//           during the application link phase.
+//
+// * Machine-code SDLs: They are archive files. For NVPTX, the archive members
+//           contain cubin for Nvidia GPUs and are linked one time during the
+//           link phase by the CUDA SDK linker called nvlink.	For AMDGPU, the
+//           process for machine code SDLs is still in development. But they
+//           will be linked by the LLVM tool lld.
+//
+// * Bundled objects that contain both host and device codes: Bundled objects
+//           may also contain library code compiled from source. For NVPTX, the
+//           bundle contains cubin. For AMDGPU, the bundle contains bitcode.
+//
+// For Bitcode and Machine-code SDLs, current compiler toolchains hardcode the
+// inclusion of specific SDLs such as math libraries and the OpenMP device
+// library libomptarget.
+void tools::AddStaticDeviceLibs(Compilation *C, const Tool *T,
+                                const JobAction *JA,
+                                const InputInfoList *Inputs, const Driver &D,
+                                const llvm::opt::ArgList &DriverArgs,
+                                llvm::opt::ArgStringList &CC1Args,
+                                StringRef Arch, StringRef Target,
+                                bool isBitCodeSDL, bool postClangLink) {
+
+  SmallVector<std::string, 8> LibraryPaths;
+  // Add search directories from LIBRARY_PATH env variable
+  llvm::Optional<std::string> LibPath =
+      llvm::sys::Process::GetEnv("LIBRARY_PATH");
+  if (LibPath) {
+    SmallVector<StringRef, 8> Frags;
+    const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'};
+    llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr);
+    for (StringRef Path : Frags)
+      LibraryPaths.emplace_back(Path.trim());
+  }
+
+  // Add directories from user-specified -L options
+  for (std::string Search_Dir : DriverArgs.getAllArgValues(options::OPT_L))
+    LibraryPaths.emplace_back(Search_Dir);
+
+  // Add path to lib-debug folders
+  SmallString<256> DefaultLibPath = llvm::sys::path::parent_path(D.Dir);
+  llvm::sys::path::append(DefaultLibPath, Twine("lib") + CLANG_LIBDIR_SUFFIX);
+  LibraryPaths.emplace_back(DefaultLibPath.c_str());
+
+  // Build list of Static Device Libraries SDLs specified by -l option
+  llvm::SmallSet<std::string, 16> SDLNames;
+  static const StringRef HostOnlyArchives[] = {
+      "omp", "cudart", "m", "gcc", "gcc_s", "pthread", "hip_hcc"};
+  for (auto SDLName : DriverArgs.getAllArgValues(options::OPT_l)) {
+    if (!HostOnlyArchives->contains(SDLName)) {
+      SDLNames.insert(SDLName);
+    }
+  }
+
+  // The search stops as soon as an SDL file is found. The driver then provides
+  // the full filename of the SDL to the llvm-link or clang-nvlink-wrapper
+  // command. If no SDL is found after searching each LINKPATH with
+  // SEARCH-ORDER, it is possible that an archive file lib<libname>.a exists
+  // and may contain bundled object files.
+  for (auto SDLName : SDLNames) {
+    // This is the only call to SDLSearch
+    if (!SDLSearch(D, DriverArgs, CC1Args, LibraryPaths, SDLName, Arch, Target,
+                   isBitCodeSDL, postClangLink)) {
+      GetSDLFromOffloadArchive(*C, D, *T, *JA, *Inputs, DriverArgs, CC1Args,
+                               LibraryPaths, SDLName, Arch, Target,
+                               isBitCodeSDL, postClangLink);
+    }
+  }
+}
+
 static llvm::opt::Arg *
 getAMDGPUCodeObjectArgument(const Driver &D, const llvm::opt::ArgList &Args) {
   // The last of -mcode-object-v3, -mno-code-object-v3 and

diff  --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h
index 8e48f3e7a5c32..00291a3681c80 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.h
+++ b/clang/lib/Driver/ToolChains/CommonArgs.h
@@ -49,6 +49,39 @@ void AddRunTimeLibs(const ToolChain &TC, const Driver &D,
                     llvm::opt::ArgStringList &CmdArgs,
                     const llvm::opt::ArgList &Args);
 
+void AddStaticDeviceLibsLinking(Compilation &C, const Tool &T,
+                                const JobAction &JA,
+                                const InputInfoList &Inputs,
+                                const llvm::opt::ArgList &DriverArgs,
+                                llvm::opt::ArgStringList &CmdArgs,
+                                StringRef Arch, StringRef Target,
+                                bool isBitCodeSDL, bool postClangLink);
+void AddStaticDeviceLibsPostLinking(const Driver &D,
+                                    const llvm::opt::ArgList &DriverArgs,
+                                    llvm::opt::ArgStringList &CmdArgs,
+                                    StringRef Arch, StringRef Target,
+                                    bool isBitCodeSDL, bool postClangLink);
+void AddStaticDeviceLibs(Compilation *C, const Tool *T, const JobAction *JA,
+                         const InputInfoList *Inputs, const Driver &D,
+                         const llvm::opt::ArgList &DriverArgs,
+                         llvm::opt::ArgStringList &CmdArgs, StringRef Arch,
+                         StringRef Target, bool isBitCodeSDL,
+                         bool postClangLink);
+
+bool SDLSearch(const Driver &D, const llvm::opt::ArgList &DriverArgs,
+               llvm::opt::ArgStringList &CmdArgs,
+               SmallVector<std::string, 8> LibraryPaths, std::string Lib,
+               StringRef Arch, StringRef Target, bool isBitCodeSDL,
+               bool postClangLink);
+
+bool GetSDLFromOffloadArchive(Compilation &C, const Driver &D, const Tool &T,
+                              const JobAction &JA, const InputInfoList &Inputs,
+                              const llvm::opt::ArgList &DriverArgs,
+                              llvm::opt::ArgStringList &CC1Args,
+                              SmallVector<std::string, 8> LibraryPaths,
+                              StringRef Lib, StringRef Arch, StringRef Target,
+                              bool isBitCodeSDL, bool postClangLink);
+
 const char *SplitDebugName(const JobAction &JA, const llvm::opt::ArgList &Args,
                            const InputInfo &Input, const InputInfo &Output);
 

diff  --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index e4a6fb8d7f2ba..18351dae39f7e 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -610,8 +610,11 @@ void NVPTX::OpenMPLinker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(CubinF);
   }
 
+  AddStaticDeviceLibsLinking(C, *this, JA, Inputs, Args, CmdArgs, "nvptx", GPUArch,
+                      false, false);
+
   const char *Exec =
-      Args.MakeArgString(getToolChain().GetProgramPath("nvlink"));
+      Args.MakeArgString(getToolChain().GetProgramPath("clang-nvlink-wrapper"));
   C.addCommand(std::make_unique<Command>(
       JA, *this,
       ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,
@@ -741,6 +744,8 @@ void CudaToolChain::addClangTargetOptions(
 
     addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix,
                        getTriple());
+    AddStaticDeviceLibsPostLinking(getDriver(), DriverArgs, CC1Args, "nvptx", GpuArch,
+                        /* bitcode SDL?*/ true, /* PostClang Link? */ true);
   }
 }
 

diff  --git a/clang/test/Driver/Inputs/openmp_static_device_link/libFatArchive.a b/clang/test/Driver/Inputs/openmp_static_device_link/libFatArchive.a
new file mode 100644
index 0000000000000..ebd7e55898b3a
Binary files /dev/null and b/clang/test/Driver/Inputs/openmp_static_device_link/libFatArchive.a 
diff er

diff  --git a/clang/test/Driver/fat_archive_amdgpu.cpp b/clang/test/Driver/fat_archive_amdgpu.cpp
new file mode 100644
index 0000000000000..b64ba8b97478c
--- /dev/null
+++ b/clang/test/Driver/fat_archive_amdgpu.cpp
@@ -0,0 +1,81 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// See the steps to create a fat archive are given at the end of the file.
+
+// Given a FatArchive, clang-offload-bundler should be called to create a
+// device specific archive, which should be passed to llvm-link.
+// RUN: %clang -O2 -### -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 %s -L%S/Inputs/openmp_static_device_link -lFatArchive 2>&1 | FileCheck %s
+// CHECK: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm-bc"{{.*}}"-target-cpu" "[[GPU:gfx[0-9]+]]"{{.*}}"-o" "[[HOSTBC:.*.bc]]" "-x" "c++"{{.*}}.cpp
+// CHECK: clang-offload-bundler" "-unbundle" "-type=a" "-inputs={{.*}}/Inputs/openmp_static_device_link/libFatArchive.a" "-targets=openmp-amdgcn-amd-amdhsa-[[GPU]]" "-outputs=[[DEVICESPECIFICARCHIVE:.*.a]]" "-allow-missing-bundles"
+// CHECK: llvm-link{{.*}}"[[HOSTBC]]" "[[DEVICESPECIFICARCHIVE]]" "-o" "{{.*}}-[[GPU]]-linked-{{.*}}.bc"
+// CHECK: ld"{{.*}}" "-L{{.*}}/Inputs/openmp_static_device_link" "{{.*}} "-lFatArchive" "{{.*}}" "-lomp{{.*}}-lomptarget"
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+#define N 10
+
+#pragma omp declare target
+// Functions defined in Fat Archive.
+extern "C" void func_present(float *, float *, unsigned);
+
+#ifdef MISSING
+// Function not defined in the fat archive.
+extern "C" void func_missing(float *, float *, unsigned);
+#endif
+
+#pragma omp end declare target
+
+int main() {
+  float in[N], out[N], sum = 0;
+  unsigned i;
+
+#pragma omp parallel for
+  for (i = 0; i < N; ++i) {
+    in[i] = i;
+  }
+
+  func_present(in, out, N); // Returns out[i] = a[i] * 0
+
+#ifdef MISSING
+  func_missing(in, out, N); // Should throw an error here
+#endif
+
+#pragma omp parallel for reduction(+ \
+                                   : sum)
+  for (i = 0; i < N; ++i)
+    sum += out[i];
+
+  if (!sum)
+    return 0;
+  return sum;
+}
+
+#endif
+
+/***********************************************
+   Steps to create Fat Archive (libFatArchive.a)
+************************************************
+***************** File: func_1.c ***************
+void func_present(float* in, float* out, unsigned n){
+  unsigned i;
+  #pragma omp target teams distribute parallel for map(to: in[0:n]) map(from: out[0:n])
+  for(i=0; i<n; ++i){
+    out[i] = in[i] * 0;
+  }
+}
+*************************************************
+1. Compile source file(s) to generate object file(s)
+    clang -O2 -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 -c func_1.c -o func_1_gfx906.o
+    clang -O2 -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908 -c func_1.c -o func_1_gfx908.o
+    clang -O2 -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 -c func_2.c -o func_2_gfx906.o
+    clang -O2 -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908 -c func_2.c -o func_2_gfx908.o
+    clang -O2 -fopenmp -fopenmp-targets=nvptx64 -c func_1.c -o func_1_nvptx.o
+    clang -O2 -fopenmp -fopenmp-targets=nvptx64 -c func_2.c -o func_2_nvptx.o
+
+2. Create a fat archive by combining all the object file(s)
+    llvm-ar cr libFatArchive.a func_1_gfx906.o func_1_gfx908.o func_2_gfx906.o func_2_gfx908.o func_1_nvptx.o func_2_nvptx.o
+************************************************/

diff  --git a/clang/test/Driver/fat_archive_nvptx.cpp b/clang/test/Driver/fat_archive_nvptx.cpp
new file mode 100644
index 0000000000000..72e20d00651e7
--- /dev/null
+++ b/clang/test/Driver/fat_archive_nvptx.cpp
@@ -0,0 +1,81 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// See the steps to create a fat archive are given at the end of the file.
+
+// Given a FatArchive, clang-offload-bundler should be called to create a
+// device specific archive, which should be passed to clang-nvlink-wrapper.
+// RUN: %clang -O2 -### -fopenmp -fopenmp-targets=nvptx64 %s -L%S/Inputs/openmp_static_device_link -lFatArchive 2>&1 | FileCheck %s
+// CHECK: clang{{.*}}"-cc1"{{.*}}"-triple" "nvptx64"{{.*}}"-target-cpu" "[[GPU:sm_[0-9]+]]"{{.*}}"-o" "[[HOSTBC:.*.s]]" "-x" "c++"{{.*}}.cpp
+// CHECK: clang-offload-bundler" "-unbundle" "-type=a" "-inputs={{.*}}/Inputs/openmp_static_device_link/libFatArchive.a" "-targets=openmp-nvptx64-[[GPU]]" "-outputs=[[DEVICESPECIFICARCHIVE:.*.a]]" "-allow-missing-bundles"
+// CHECK: clang-nvlink-wrapper{{.*}}"-o" "{{.*}}.out" "-arch" "[[GPU]]" "{{.*}}[[DEVICESPECIFICARCHIVE]]"
+// CHECK: ld"{{.*}}" "-L{{.*}}/Inputs/openmp_static_device_link" "{{.*}} "-lFatArchive" "{{.*}}" "-lomp{{.*}}-lomptarget"
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+#define N 10
+
+#pragma omp declare target
+// Functions defined in Fat Archive.
+extern "C" void func_present(float *, float *, unsigned);
+
+#ifdef MISSING
+// Function not defined in the fat archive.
+extern "C" void func_missing(float *, float *, unsigned);
+#endif
+
+#pragma omp end declare target
+
+int main() {
+  float in[N], out[N], sum = 0;
+  unsigned i;
+
+#pragma omp parallel for
+  for (i = 0; i < N; ++i) {
+    in[i] = i;
+  }
+
+  func_present(in, out, N); // Returns out[i] = a[i] * 0
+
+#ifdef MISSING
+  func_missing(in, out, N); // Should throw an error here
+#endif
+
+#pragma omp parallel for reduction(+ \
+                                   : sum)
+  for (i = 0; i < N; ++i)
+    sum += out[i];
+
+  if (!sum)
+    return 0;
+  return sum;
+}
+
+#endif
+
+/***********************************************
+   Steps to create Fat Archive (libFatArchive.a)
+************************************************
+***************** File: func_1.c ***************
+void func_present(float* in, float* out, unsigned n){
+  unsigned i;
+  #pragma omp target teams distribute parallel for map(to: in[0:n]) map(from: out[0:n])
+  for(i=0; i<n; ++i){
+    out[i] = in[i] * 0;
+  }
+}
+*************************************************
+1. Compile source file(s) to generate object file(s)
+    clang -O2 -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 -c func_1.c -o func_1_gfx906.o
+    clang -O2 -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908 -c func_1.c -o func_1_gfx908.o
+    clang -O2 -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 -c func_2.c -o func_2_gfx906.o
+    clang -O2 -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908 -c func_2.c -o func_2_gfx908.o
+    clang -O2 -fopenmp -fopenmp-targets=nvptx64 -c func_1.c -o func_1_nvptx.o
+    clang -O2 -fopenmp -fopenmp-targets=nvptx64 -c func_2.c -o func_2_nvptx.o
+
+2. Create a fat archive by combining all the object file(s)
+    llvm-ar cr libFatArchive.a func_1_gfx906.o func_1_gfx908.o func_2_gfx906.o func_2_gfx908.o func_1_nvptx.o func_2_nvptx.o
+************************************************/

diff  --git a/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp b/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
index 522b5d33341fb..0dbb75f67b289 100644
--- a/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
+++ b/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
@@ -180,6 +180,28 @@ struct OffloadTargetInfo {
   }
 };
 
+static StringRef getDeviceFileExtension(StringRef Device) {
+  if (Device.contains("gfx"))
+    return ".bc";
+  if (Device.contains("sm_"))
+    return ".cubin";
+
+  WithColor::warning() << "Could not determine extension for archive"
+                          "members, using \".o\"\n";
+  return ".o";
+}
+
+static std::string getDeviceLibraryFileName(StringRef BundleFileName,
+                                            StringRef Device) {
+  StringRef LibName = sys::path::stem(BundleFileName);
+  StringRef Extension = getDeviceFileExtension(Device);
+
+  std::string Result;
+  Result += LibName;
+  Result += Extension;
+  return Result;
+}
+
 /// Generic file handler interface.
 class FileHandler {
 public:
@@ -1229,7 +1251,9 @@ static Error UnbundleArchive() {
           BundledObjectFileName.assign(BundledObjectFile);
           auto OutputBundleName =
               Twine(llvm::sys::path::stem(BundledObjectFileName) + "-" +
-                    CodeObject)
+                    CodeObject +
+                    getDeviceLibraryFileName(BundledObjectFileName,
+                                             CodeObjectInfo.GPUArch))
                   .str();
           // Replace ':' in optional target feature list with '_' to ensure
           // cross-platform validity.