[clang] [LinkerWrapper] Extract device archives if symbol is used on the host (PR #111921)

Joseph Huber via cfe-commits cfe-commits at lists.llvm.org
Mon Oct 28 07:18:38 PDT 2024


https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/111921

>From 098e554223f843193ee142ea5f9b0a556477d309 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 10 Oct 2024 18:26:27 -0500
Subject: [PATCH] [LinkerWrapper] Extract device archives if symbol is used on
 the host

Summary:
Standard archive linking semantics dictate that archive members are only
extracted if they are used, i.e. define a currently undefined symbol.
This poses some issues for offloading langues because there can be a
kernel called `foo` in a device archive. The CPU archive is extracted
but `foo` is never referenced on the device so it is not extracted.

This patch augments this behavior to consider all symbols used by the
offloading toolchain as 'undefined' so they will extract archives. We do
this by getting a list of all the kernel names the runtime will call
from the offloading entires list.

Not that this does not bother checking if the *host* side will actually
use the archive, so the case where a kernel is not used by the static
library will still cause it to extract. I could fix this but it felt
like a lot of effort for little gain.

Previously this was solved by forcibly extracting anything with public
visibility. This means that we can now link device code with externally
visible symbols and don't need to worry about it always extracting as
well.

Depends on https://github.com/llvm/llvm-project/pull/111890
---
 clang/test/Driver/linker-wrapper-libs.c       |  47 -------
 clang/test/Driver/linker-wrapper.c            |   7 +-
 .../ClangLinkerWrapper.cpp                    | 123 +++++++++++++++---
 3 files changed, 107 insertions(+), 70 deletions(-)

diff --git a/clang/test/Driver/linker-wrapper-libs.c b/clang/test/Driver/linker-wrapper-libs.c
index cb5c7c137a0bab..2adf4cc83c5d31 100644
--- a/clang/test/Driver/linker-wrapper-libs.c
+++ b/clang/test/Driver/linker-wrapper-libs.c
@@ -51,30 +51,6 @@ int bar() { return weak; }
 // LIBRARY-RESOLVES: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
 // LIBRARY-RESOLVES: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
 
-//
-// Check that we extract a static library that defines a global visibile to the
-// host.
-//
-// RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -DGLOBAL -o %t.nvptx.global.bc
-// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -DGLOBAL -o %t.amdgpu.global.bc
-// RUN: clang-offload-packager -o %t-lib.out \
-// RUN:   --image=file=%t.nvptx.global.bc,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.amdgpu.global.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t-lib.out
-// RUN: llvm-ar rcs %t.a %t.o
-// RUN: clang-offload-packager -o %t.out \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
-// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld %t.o %t.a -o a.out 2>&1 \
-// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld %t.a %t.o -o a.out 2>&1 \
-// RUN: | FileCheck %s --check-prefix=LIBRARY-GLOBAL
-
-// LIBRARY-GLOBAL: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
-// LIBRARY-GLOBAL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
-
 //
 // Check that we do not extract a global symbol if the source file was not
 // created by an offloading language that expects there to be a host version of
@@ -142,29 +118,6 @@ int bar() { return weak; }
 // LIBRARY-HIDDEN-NOT: {{.*}}.o {{.*}}.o
 // LIBRARY-HIDDEN: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030
 
-//
-// Check that we do not extract a static library that defines a global visibile
-// to the host that is already defined.
-//
-// RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -DGLOBAL -o %t.nvptx.global.bc
-// RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -DGLOBAL -o %t.amdgpu.global.bc
-// RUN: clang-offload-packager -o %t-lib.out \
-// RUN:   --image=file=%t.nvptx.global.bc,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.amdgpu.global.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t-lib.out
-// RUN: llvm-ar rcs %t.a %t.o
-// RUN: clang-offload-packager -o %t.out \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
-// RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030
-// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
-// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld %t.o %t.a %t.a -o a.out 2>&1 \
-// RUN: | FileCheck %s --check-prefix=LIBRARY-GLOBAL-DEFINED
-
-// LIBRARY-GLOBAL-DEFINED: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o
-// LIBRARY-GLOBAL-DEFINED-NOT: {{.*}}gfx1030{{.*}}.o
-// LIBRARY-GLOBAL-DEFINED: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}.o {{.*}}.o
-
 //
 // Check that we can use --[no-]whole-archive to control extraction.
 //
diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index 068ea2d7d3c663..7982f3d046861a 100644
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -4,9 +4,6 @@
 
 // REQUIRES: system-linux
 
-// An externally visible variable so static libraries extract.
-__attribute__((visibility("protected"), used)) int x;
-
 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.elf.o
 // RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -o %t.nvptx.bc
 // RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -o %t.amdgpu.bc
@@ -169,7 +166,7 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a:xnack-
 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t-off.o -fembed-offload-object=%t-off.out
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld %t-on.o %t-off.o %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=AMD-TARGET-ID
+// RUN:   --linker-path=/usr/bin/ld %t-on.o %t-off.o --whole-archive %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=AMD-TARGET-ID
 
 // AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+ -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o
 // AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack- -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o
@@ -185,7 +182,7 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908
 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t2.o -fembed-offload-object=%t2.out
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
-// RUN:   --linker-path=/usr/bin/ld %t1.o %t2.o %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=ARCH-ALL
+// RUN:   --linker-path=/usr/bin/ld %t1.o %t2.o --whole-archive %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=ARCH-ALL
 
 // ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o
 // ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 9fea1fdcd5fb46..bf5b792d9e3e07 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -1477,14 +1477,7 @@ Expected<bool> getSymbolsFromBitcode(MemoryBufferRef Buffer, OffloadKind Kind,
       bool ResolvesStrongReference =
           ((OldSym & Sym_Undefined && !(OldSym & Sym_Weak)) &&
            !Sym.isUndefined());
-      // We will extract if it defines a new global symbol visible to the
-      // host. This is only necessary for code targeting an offloading
-      // language.
-      bool NewGlobalSymbol =
-          ((NewSymbol || (OldSym & Sym_Undefined)) && !Sym.isUndefined() &&
-           !Sym.canBeOmittedFromSymbolTable() && Kind != object::OFK_None &&
-           (Sym.getVisibility() != GlobalValue::HiddenVisibility));
-      ShouldExtract |= ResolvesStrongReference | NewGlobalSymbol;
+      ShouldExtract |= ResolvesStrongReference;
 
       // Update this symbol in the "table" with the new information.
       if (OldSym & Sym_Undefined && !Sym.isUndefined())
@@ -1533,15 +1526,7 @@ Expected<bool> getSymbolsFromObject(const ObjectFile &Obj, OffloadKind Kind,
     bool ResolvesStrongReference = (OldSym & Sym_Undefined) &&
                                    !(OldSym & Sym_Weak) &&
                                    !(*FlagsOrErr & SymbolRef::SF_Undefined);
-
-    // We will extract if it defines a new global symbol visible to the
-    // host. This is only necessary for code targeting an offloading
-    // language.
-    bool NewGlobalSymbol =
-        ((NewSymbol || (OldSym & Sym_Undefined)) &&
-         !(*FlagsOrErr & SymbolRef::SF_Undefined) && Kind != object::OFK_None &&
-         !(*FlagsOrErr & SymbolRef::SF_Hidden));
-    ShouldExtract |= ResolvesStrongReference | NewGlobalSymbol;
+    ShouldExtract |= ResolvesStrongReference;
 
     // Update this symbol in the "table" with the new information.
     if (OldSym & Sym_Undefined && !(*FlagsOrErr & SymbolRef::SF_Undefined))
@@ -1586,10 +1571,101 @@ Expected<bool> getSymbols(StringRef Image, OffloadKind Kind, bool IsArchive,
   }
 }
 
+Error getNeededDeviceGlobalsFromBitcode(MemoryBufferRef Buffer,
+                                        SmallVectorImpl<std::string> &Symbols) {
+
+  LLVMContext Context;
+  SMDiagnostic Err;
+  std::unique_ptr<Module> M = getLazyIRModule(
+      MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false), Err,
+      Context);
+  if (!M)
+    return createStringError(inconvertibleErrorCode(),
+                             "Failed to create module");
+
+  // Extract offloading data from globals referenced by the
+  // `llvm.embedded.object` metadata with the `.llvm.offloading` section.
+  auto *MD = M->getNamedMetadata("llvm.offloading.symbols");
+  if (!MD)
+    return Error::success();
+
+  for (const MDNode *Op : MD->operands()) {
+    GlobalVariable *GV =
+        mdconst::dyn_extract_or_null<GlobalVariable>(Op->getOperand(0));
+    if (!GV)
+      continue;
+
+    auto *CDS = dyn_cast<ConstantDataSequential>(GV->getInitializer());
+    if (!CDS)
+      continue;
+
+    Symbols.emplace_back(CDS->getAsCString().str());
+  }
+
+  return Error::success();
+}
+
+Error getNeededDeviceGlobalsFromObject(const ObjectFile &Obj,
+                                       SmallVectorImpl<std::string> &Symbols) {
+  for (const auto &Sec : Obj.sections()) {
+    auto NameOrErr = Sec.getName();
+    if (!NameOrErr)
+      return NameOrErr.takeError();
+
+    if (*NameOrErr != ".llvm.rodata.offloading")
+      continue;
+
+    auto ContentsOrErr = Sec.getContents();
+    if (!ContentsOrErr)
+      return ContentsOrErr.takeError();
+    llvm::transform(llvm::split(ContentsOrErr->drop_back(), '\0'),
+                    std::back_inserter(Symbols),
+                    [](StringRef Str) { return Str.str(); });
+  }
+  return Error::success();
+}
+
+Error getNeededDeviceGlobals(MemoryBufferRef Buffer,
+                             SmallVectorImpl<std::string> &Symbols) {
+  file_magic Type = identify_magic(Buffer.getBuffer());
+  switch (Type) {
+  case file_magic::bitcode:
+    return getNeededDeviceGlobalsFromBitcode(Buffer, Symbols);
+  case file_magic::elf_relocatable:
+  case file_magic::coff_object: {
+    Expected<std::unique_ptr<ObjectFile>> ObjFile =
+        ObjectFile::createObjectFile(Buffer, Type);
+    if (!ObjFile)
+      return ObjFile.takeError();
+    return getNeededDeviceGlobalsFromObject(*ObjFile->get(), Symbols);
+  }
+  case file_magic::archive: {
+    Expected<std::unique_ptr<llvm::object::Archive>> LibFile =
+        object::Archive::create(Buffer);
+    if (!LibFile)
+      return LibFile.takeError();
+    Error Err = Error::success();
+    for (auto Child : (*LibFile)->children(Err)) {
+      auto ChildBufferOrErr = Child.getMemoryBufferRef();
+      if (!ChildBufferOrErr)
+        return ChildBufferOrErr.takeError();
+      if (Error Err = getNeededDeviceGlobals(*ChildBufferOrErr, Symbols))
+        return Err;
+    }
+    if (Err)
+      return Err;
+    return Error::success();
+  }
+  default:
+    return Error::success();
+  }
+}
+
 /// Search the input files and libraries for embedded device offloading code
 /// and add it to the list of files to be linked. Files coming from static
 /// libraries are only added to the input if they are used by an existing
-/// input file. Returns a list of input files intended for a single linking job.
+/// input file. Returns a list of input files intended for a single linking
+/// job.
 Expected<SmallVector<SmallVector<OffloadFile>>>
 getDeviceInput(const ArgList &Args) {
   llvm::TimeTraceScope TimeScope("ExtractDeviceCode");
@@ -1610,6 +1686,7 @@ getDeviceInput(const ArgList &Args) {
   bool WholeArchive = Args.hasArg(OPT_wholearchive_flag) ? true : false;
   SmallVector<OffloadFile> ObjectFilesToExtract;
   SmallVector<OffloadFile> ArchiveFilesToExtract;
+  SmallVector<std::string> NeededSymbols;
   for (const opt::Arg *Arg : Args.filtered(
            OPT_INPUT, OPT_library, OPT_whole_archive, OPT_no_whole_archive)) {
     if (Arg->getOption().matches(OPT_whole_archive) ||
@@ -1640,6 +1717,9 @@ getDeviceInput(const ArgList &Args) {
     if (identify_magic(Buffer.getBuffer()) == file_magic::elf_shared_object)
       continue;
 
+    if (Error Err = getNeededDeviceGlobals(Buffer, NeededSymbols))
+      return std::move(Err);
+
     SmallVector<OffloadFile> Binaries;
     if (Error Err = extractOffloadBinaries(Buffer, Binaries))
       return std::move(Err);
@@ -1666,6 +1746,13 @@ getDeviceInput(const ArgList &Args) {
         CompatibleTargets.emplace_back(ID);
 
     for (const auto &[Index, ID] : llvm::enumerate(CompatibleTargets)) {
+      // Initialize the symbol table with the device globals that host needs.
+      // This will force them to be extracted if they are defined in a library.
+      if (!Syms.contains(ID)) {
+        for (auto &Symbol : NeededSymbols)
+          auto &Val = Syms[ID][Symbol] = Sym_Undefined;
+      }
+
       Expected<bool> ExtractOrErr = getSymbols(
           Binary.getBinary()->getImage(), Binary.getBinary()->getOffloadKind(),
           /*IsArchive=*/false, Saver, Syms[ID]);



More information about the cfe-commits mailing list