[clang] d9c64d3 - [OpenMP] Allow CUDA to be linked with OpenMP using the new driver

Fri Apr 29 08:38:48 PDT 2022

Author: Joseph Huber
Date: 2022-04-29T11:38:40-04:00
New Revision: d9c64d33b98be695fc78a65624242033058ed117

URL: https://github.com/llvm/llvm-project/commit/d9c64d33b98be695fc78a65624242033058ed117
DIFF: https://github.com/llvm/llvm-project/commit/d9c64d33b98be695fc78a65624242033058ed117.diff

LOG: [OpenMP] Allow CUDA to be linked with OpenMP using the new driver

After basic support for embedding and handling CUDA files was added to
the new driver, we should be able to call CUDA functions from OpenMP
code. This patch makes the necessary changes to successfuly link in CUDA
programs that were compiled using the new driver. With this patch it
should be possible to compile device-only CUDA code (no kernels) and
call it from OpenMP as follows:

```
$ clang++ cuda.cu -fopenmp-new-driver -offload-arch=sm_70 -c
$ clang++ openmp.cpp cuda.o -fopenmp-new-driver -fopenmp -fopenmp-targets=nvptx64 -Xopenmp-target=nvptx64 -march=sm_70
```

Currently this requires using a host variant to suppress the generation
of a CPU-side fallback call.

Depends on D120272

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D120273

Added: 
    

Modified: 
    clang/test/Driver/linker-wrapper.c
    clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp

Removed: 
    


################################################################################
diff  --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index 7920fe8c1a99..a180a792284e 100644

--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -40,3 +40,11 @@
 
 // LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 {{.*}}.s
 // LTO-NOT: nvlink
+
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
+// RUN:   -fembed-offload-object=%S/Inputs/dummy-elf.o,openmp,nvptx64-nvida-cuda,sm_70 \
+// RUN:   -fembed-offload-object=%S/Inputs/dummy-elf.o,cuda,nvptx64-nvida-cuda,sm_70
+// RUN: clang-linker-wrapper --host-triple x86_64-unknown-linux-gnu --dry-run -linker-path \
+// RUN:   /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CUDA_OMP_LINK
+
+// CUDA_OMP_LINK: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.o {{.*}}.o

diff  --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 09381d93f17d..c2c1fa738be6 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -160,6 +160,10 @@ static codegen::RegisterCodeGenFlags CodeGenFlags;
 /// section will contain one or more offloading binaries stored contiguously.
 #define OFFLOAD_SECTION_MAGIC_STR ".llvm.offloading"
 
+/// The magic offset for the first object inside CUDA's fatbinary. This can be
+/// 
diff erent but it should work for what is passed here.
+static constexpr unsigned FatbinaryOffset = 0x50;
+
 /// Information for a device offloading file extracted from the host.
 struct DeviceFile {
   DeviceFile(StringRef Kind, StringRef TheTriple, StringRef Arch,
@@ -173,7 +177,10 @@ struct DeviceFile {
 };
 
 namespace llvm {
-/// Helper that allows DeviceFile to be used as a key in a DenseMap.
+/// Helper that allows DeviceFile to be used as a key in a DenseMap. For now we
+/// assume device files with matching architectures and triples but 
diff erent
+/// offloading kinds should be handlded together, this may not be true in the
+/// future.
 template <> struct DenseMapInfo<DeviceFile> {
   static DeviceFile getEmptyKey() {
     return {DenseMapInfo<StringRef>::getEmptyKey(),
@@ -953,13 +960,37 @@ Error linkBitcodeFiles(SmallVectorImpl<std::string> &InputFiles,
         MemoryBuffer::getFileOrSTDIN(File);
     if (std::error_code EC = BufferOrErr.getError())
       return createFileError(File, EC);
+    MemoryBufferRef Buffer = **BufferOrErr;
 
     file_magic Type = identify_magic((*BufferOrErr)->getBuffer());
-    if (Type != file_magic::bitcode) {
+    switch (Type) {
+    case file_magic::bitcode: {
+      Expected<std::unique_ptr<lto::InputFile>> InputFileOrErr =
+          llvm::lto::InputFile::create(Buffer);
+      if (!InputFileOrErr)
+        return InputFileOrErr.takeError();
+
+      // Save the input file and the buffer associated with its memory.
+      BitcodeFiles.push_back(std::move(*InputFileOrErr));
+      SavedBuffers.push_back(std::move(*BufferOrErr));
+      continue;
+    }
+    case file_magic::cuda_fatbinary: {
+      // Cuda fatbinaries made by Clang almost almost have an object eighty
+      // bytes from the beginning. This should be sufficient to identify the
+      // symbols.
+      Buffer = MemoryBufferRef(
+          (*BufferOrErr)->getBuffer().drop_front(FatbinaryOffset), "FatBinary");
+      LLVM_FALLTHROUGH;
+    }
+    case file_magic::elf_relocatable:
+    case file_magic::elf_shared_object:
+    case file_magic::macho_object:
+    case file_magic::coff_object: {
       Expected<std::unique_ptr<ObjectFile>> ObjFile =
-          ObjectFile::createObjectFile(**BufferOrErr, Type);
+          ObjectFile::createObjectFile(Buffer);
       if (!ObjFile)
-        return ObjFile.takeError();
+        continue;
 
       NewInputFiles.push_back(File.str());
       for (auto &Sym : (*ObjFile)->symbols()) {
@@ -973,15 +1004,10 @@ Error linkBitcodeFiles(SmallVectorImpl<std::string> &InputFiles,
         else
           UsedInSharedLib.insert(Saver.save(*Name));
       }
-    } else {
-      Expected<std::unique_ptr<lto::InputFile>> InputFileOrErr =
-          llvm::lto::InputFile::create(**BufferOrErr);
-      if (!InputFileOrErr)
-        return InputFileOrErr.takeError();
-
-      // Save the input file and the buffer associated with its memory.
-      BitcodeFiles.push_back(std::move(*InputFileOrErr));
-      SavedBuffers.push_back(std::move(*BufferOrErr));
+      continue;
+    }
+    default:
+      continue;
     }
   }