[lld] [ELF] Fix unnecessary inclusion of unreferenced provide symbols (PR #84512)

via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 25 15:57:18 PDT 2024


https://github.com/partaror updated https://github.com/llvm/llvm-project/pull/84512

>From 65fd9d73f798919ec6143c031066059823d1e5e1 Mon Sep 17 00:00:00 2001
From: Parth Arora <quic_partaror at quicinc.com>
Date: Fri, 8 Mar 2024 18:45:02 +0530
Subject: [PATCH] [ELF] Fix unnecessary inclusion of unreferenced provide
 symbols

Previously, linker was unnecessarily including a PROVIDE symbol which
was referenced by another unused PROVIDE symbol. For example, if a
linker script contained the below code and 'not_used_sym' provide symbol
is not included, then linker was still unnecessarily including 'foo' PROVIDE
symbol because it was referenced by 'not_used_sym'. This commit fixes
this behavior.

PROVIDE(not_used_sym = foo)
PROVIDE(foo = 0x1000)

This commit fixes this behavior by using dfs-like algorithm to find
all the symbols referenced in provide expressions of included provide
symbols.

This commit also fixes the issue of unused section not being garbage-collected
if a symbol of the section is referenced by an unused PROVIDE symbol.

Closes #74771
Closes #84730

Co-authored-by: Fangrui Song <i at maskray.me>
---
 lld/ELF/Driver.cpp                           | 22 ++-----
 lld/ELF/LinkerScript.cpp                     | 48 +++++++++++++---
 lld/ELF/LinkerScript.h                       | 21 +++++++
 lld/ELF/ScriptParser.cpp                     | 13 ++++-
 lld/ELF/SymbolTable.cpp                      |  4 ++
 lld/ELF/SymbolTable.h                        |  3 +
 lld/test/ELF/gc-sections-with-provide.s      | 60 ++++++++++++++++++++
 lld/test/ELF/linkerscript/symbolreferenced.s | 51 ++++++++++++++---
 8 files changed, 188 insertions(+), 34 deletions(-)
 create mode 100644 lld/test/ELF/gc-sections-with-provide.s

diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 7257ebd0fac994..c66317f5e8479a 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -2393,12 +2393,6 @@ static void readSymbolPartitionSection(InputSectionBase *s) {
   sym->partition = newPart.getNumber();
 }
 
-static Symbol *addUnusedUndefined(StringRef name,
-                                  uint8_t binding = STB_GLOBAL) {
-  return symtab.addSymbol(
-      Undefined{ctx.internalFile, name, binding, STV_DEFAULT, 0});
-}
-
 static void markBuffersAsDontNeed(bool skipLinkedOutput) {
   // With --thinlto-index-only, all buffers are nearly unused from now on
   // (except symbol/section names used by infrequent passes). Mark input file
@@ -2485,15 +2479,15 @@ static std::vector<WrappedSymbol> addWrappedSymbols(opt::InputArgList &args) {
       continue;
 
     Symbol *wrap =
-        addUnusedUndefined(saver().save("__wrap_" + name), sym->binding);
+        symtab.addUnusedUndefined(saver().save("__wrap_" + name), sym->binding);
 
     // If __real_ is referenced, pull in the symbol if it is lazy. Do this after
     // processing __wrap_ as that may have referenced __real_.
     StringRef realName = saver().save("__real_" + name);
     if (symtab.find(realName))
-      addUnusedUndefined(name, sym->binding);
+      symtab.addUnusedUndefined(name, sym->binding);
 
-    Symbol *real = addUnusedUndefined(realName);
+    Symbol *real = symtab.addUnusedUndefined(realName);
     v.push_back({sym, real, wrap});
 
     // We want to tell LTO not to inline symbols to be overwritten
@@ -2723,7 +2717,7 @@ template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) {
   // Handle -u/--undefined before input files. If both a.a and b.so define foo,
   // -u foo a.a b.so will extract a.a.
   for (StringRef name : config->undefined)
-    addUnusedUndefined(name)->referenced = true;
+    symtab.addUnusedUndefined(name)->referenced = true;
 
   // Add all files to the symbol table. This will add almost all
   // symbols that we need to the symbol table. This process might
@@ -2748,13 +2742,7 @@ template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) {
   config->hasDynSymTab =
       !ctx.sharedFiles.empty() || config->isPic || config->exportDynamic;
 
-  // Some symbols (such as __ehdr_start) are defined lazily only when there
-  // are undefined symbols for them, so we add these to trigger that logic.
-  for (StringRef name : script->referencedSymbols) {
-    Symbol *sym = addUnusedUndefined(name);
-    sym->isUsedInRegularObj = true;
-    sym->referenced = true;
-  }
+  script->addScriptReferencedSymbolsToSymTable();
 
   // Prevent LTO from removing any definition referenced by -u.
   for (StringRef name : config->undefined)
diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp
index 3af09a32b65113..f815b3ac6feeda 100644
--- a/lld/ELF/LinkerScript.cpp
+++ b/lld/ELF/LinkerScript.cpp
@@ -198,15 +198,7 @@ static bool shouldDefineSym(SymbolAssignment *cmd) {
   if (cmd->name == ".")
     return false;
 
-  if (!cmd->provide)
-    return true;
-
-  // If a symbol was in PROVIDE(), we need to define it only
-  // when it is a referenced undefined symbol.
-  Symbol *b = symtab.find(cmd->name);
-  if (b && !b->isDefined() && !b->isCommon())
-    return true;
-  return false;
+  return !cmd->provide || LinkerScript::shouldAddProvideSym(cmd->name);
 }
 
 // Called by processSymbolAssignments() to assign definitions to
@@ -1517,3 +1509,41 @@ void LinkerScript::checkFinalScriptConditions() const {
       checkMemoryRegion(lmaRegion, sec, sec->getLMA());
   }
 }
+
+void LinkerScript::addScriptReferencedSymbolsToSymTable() {
+  // Some symbols (such as __ehdr_start) are defined lazily only when there
+  // are undefined symbols for them, so we add these to trigger that logic.
+  auto reference = [](StringRef name) {
+    Symbol *sym = symtab.addUnusedUndefined(name);
+    sym->isUsedInRegularObj = true;
+    sym->referenced = true;
+  };
+  for (StringRef name : referencedSymbols)
+    reference(name);
+
+  // Keeps track of references from which PROVIDE symbols have been added to the
+  // symbol table.
+  DenseSet<StringRef> added;
+  SmallVector<const SmallVector<StringRef, 0> *, 0> symRefsVec;
+  for (const auto &[name, symRefs] : provideMap)
+    if (LinkerScript::shouldAddProvideSym(name) && added.insert(name).second)
+      symRefsVec.push_back(&symRefs);
+  while (symRefsVec.size()) {
+    for (StringRef name : *symRefsVec.pop_back_val()) {
+      reference(name);
+      // Prevent the symbol from being discarded by --gc-sections.
+      script->referencedSymbols.push_back(name);
+      auto it = script->provideMap.find(name);
+      if (it != script->provideMap.end() &&
+          LinkerScript::shouldAddProvideSym(name) &&
+          added.insert(name).second) {
+        symRefsVec.push_back(&it->second);
+      }
+    }
+  }
+}
+
+bool LinkerScript::shouldAddProvideSym(StringRef symName) {
+  Symbol *sym = symtab.find(symName);
+  return sym && !sym->isDefined() && !sym->isCommon();
+}
diff --git a/lld/ELF/LinkerScript.h b/lld/ELF/LinkerScript.h
index 18eaf58b785e37..fa7c6eb9c0d8f7 100644
--- a/lld/ELF/LinkerScript.h
+++ b/lld/ELF/LinkerScript.h
@@ -16,6 +16,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
 #include <cstddef>
@@ -348,6 +349,18 @@ class LinkerScript final {
   // Check backward location counter assignment and memory region/LMA overflows.
   void checkFinalScriptConditions() const;
 
+  // Add symbols that are referenced in the linker script to the symbol table.
+  // Symbols referenced in a PROVIDE command are only added to the symbol table
+  // if the PROVIDE command actually provides the symbol.
+  // It also adds the symbols referenced by the used PROVIDE symbols to the
+  // linker script referenced symbols list.
+  void addScriptReferencedSymbolsToSymTable();
+
+  // Returns true if the PROVIDE symbol should be added to the link.
+  // A PROVIDE symbol is added to the link only if it satisfies an
+  // undefined reference.
+  static bool shouldAddProvideSym(StringRef symName);
+
   // SECTIONS command list.
   SmallVector<SectionCommand *, 0> sectionCommands;
 
@@ -379,6 +392,14 @@ class LinkerScript final {
 
   // Sections that will be warned/errored by --orphan-handling.
   SmallVector<const InputSectionBase *, 0> orphanSections;
+
+  // Stores the mapping: PROVIDE symbol -> symbols referred in the PROVIDE
+  // expression. For example, if the PROVIDE command is:
+  //
+  // PROVIDE(v = a + b + c);
+  //
+  // then provideMap should contain the mapping: 'v' -> ['a', 'b', 'c']
+  llvm::MapVector<StringRef, SmallVector<StringRef, 0>> provideMap;
 };
 
 LLVM_LIBRARY_VISIBILITY extern std::unique_ptr<LinkerScript> script;
diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp
index 3bb1de99480f30..f90ce6fa740757 100644
--- a/lld/ELF/ScriptParser.cpp
+++ b/lld/ELF/ScriptParser.cpp
@@ -36,6 +36,7 @@
 #include "llvm/Support/TimeProfiler.h"
 #include <cassert>
 #include <limits>
+#include <optional>
 #include <vector>
 
 using namespace llvm;
@@ -138,6 +139,10 @@ class ScriptParser final : ScriptLexer {
 
   // A set to detect an INCLUDE() cycle.
   StringSet<> seen;
+
+  // If we are currently parsing a PROVIDE|PROVIDE_HIDDEN command,
+  // then this member is set to the PROVIDE symbol name.
+  std::optional<llvm::StringRef> activeProvideSym;
 };
 } // namespace
 
@@ -1055,6 +1060,9 @@ SymbolAssignment *ScriptParser::readProvideHidden(bool provide, bool hidden) {
       ;
     return nullptr;
   }
+  llvm::SaveAndRestore saveActiveProvideSym(activeProvideSym);
+  if (provide)
+    activeProvideSym = name;
   SymbolAssignment *cmd = readSymbolAssignment(name);
   cmd->provide = provide;
   cmd->hidden = hidden;
@@ -1570,7 +1578,10 @@ Expr ScriptParser::readPrimary() {
     tok = unquote(tok);
   else if (!isValidSymbolName(tok))
     setError("malformed number: " + tok);
-  script->referencedSymbols.push_back(tok);
+  if (activeProvideSym)
+    script->provideMap[*activeProvideSym].push_back(tok);
+  else
+    script->referencedSymbols.push_back(tok);
   return [=] { return script->getSymbolValue(tok, location); };
 }
 
diff --git a/lld/ELF/SymbolTable.cpp b/lld/ELF/SymbolTable.cpp
index b3d97e4de77906..258a78ab40bb57 100644
--- a/lld/ELF/SymbolTable.cpp
+++ b/lld/ELF/SymbolTable.cpp
@@ -333,3 +333,7 @@ void SymbolTable::scanVersionScript() {
   // --dynamic-list.
   handleDynamicList();
 }
+
+Symbol *SymbolTable::addUnusedUndefined(StringRef name, uint8_t binding) {
+  return addSymbol(Undefined{ctx.internalFile, name, binding, STV_DEFAULT, 0});
+}
diff --git a/lld/ELF/SymbolTable.h b/lld/ELF/SymbolTable.h
index 37e31d23729637..269f7f284bc734 100644
--- a/lld/ELF/SymbolTable.h
+++ b/lld/ELF/SymbolTable.h
@@ -57,6 +57,9 @@ class SymbolTable {
 
   void handleDynamicList();
 
+  Symbol *addUnusedUndefined(StringRef name,
+                             uint8_t binding = llvm::ELF::STB_GLOBAL);
+
   // Set of .so files to not link the same shared object file more than once.
   llvm::DenseMap<llvm::CachedHashStringRef, SharedFile *> soNames;
 
diff --git a/lld/test/ELF/gc-sections-with-provide.s b/lld/test/ELF/gc-sections-with-provide.s
new file mode 100644
index 00000000000000..3e5b1b1efe6ca8
--- /dev/null
+++ b/lld/test/ELF/gc-sections-with-provide.s
@@ -0,0 +1,60 @@
+# REQUIRES: x86
+
+# This test verifies that garbage-collection is correctly garbage collecting
+# unused sections when the symbol of the unused section is only referred by
+# an unused PROVIDE symbol.
+
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
+# RUN: ld.lld -o a_nogc a.o -T script.t
+# RUN: llvm-nm a_nogc | FileCheck -check-prefix=NOGC %s
+# RUN: ld.lld -o a_gc a.o --gc-sections --print-gc-sections -T script.t | FileCheck --check-prefix=GC_LINK %s
+# RUN: llvm-nm a_gc | FileCheck -check-prefix=GC %s
+
+NOGC-NOT: another_unused
+NOGC: another_used
+NOGC: bar
+NOGC: baz
+NOGC: baz_ref
+NOGC: foo
+NOGC-NOT: unused
+NOGC: used
+
+GC_LINK: removing unused section a.o:(.text.bar)
+
+GC-NOT: another_unused
+GC: another_used
+GC-NOT: bar
+GC: baz
+GC: baz_ref
+GC: foo
+GC-NOT: unused
+GC: used
+
+#--- a.s
+.global _start
+_start:
+ call foo
+ call used
+
+.section .text.foo,"ax", at progbits
+foo:
+ nop
+
+.section .text.bar,"ax", at progbits
+.global bar
+bar:
+ nop
+
+.section .text.baz,"ax", at progbits
+.global baz
+baz:
+ nop
+
+
+#--- script.t
+PROVIDE(unused = bar + used);
+PROVIDE(used = another_used);
+PROVIDE(baz_ref = baz);
+PROVIDE(another_used = baz_ref);
+PROVIDE(another_unused = unused + bar + 0x1);
diff --git a/lld/test/ELF/linkerscript/symbolreferenced.s b/lld/test/ELF/linkerscript/symbolreferenced.s
index ba7a7721ea9697..6f583d20e27641 100644
--- a/lld/test/ELF/linkerscript/symbolreferenced.s
+++ b/lld/test/ELF/linkerscript/symbolreferenced.s
@@ -21,11 +21,31 @@
 # RUN: ld.lld -o chain -T chain.t a.o
 # RUN: llvm-nm chain | FileCheck %s
 
-# CHECK:      0000000000001000 a f1
-# CHECK-NEXT: 0000000000001000 A f2
-# CHECK-NEXT: 0000000000001000 a g1
-# CHECK-NEXT: 0000000000001000 A g2
-# CHECK-NEXT: 0000000000001000 A newsym
+# CHECK-NOT: another_unused
+# CHECK:      0000000000007000 a f1
+# CHECK-NEXT: 0000000000007000 A f2
+# CHECK-NEXT: 0000000000007000 A f3
+# CHECK-NEXT: 0000000000007000 A f4
+# CHECK-NEXT: 0000000000006000 A f5
+# CHECK-NEXT: 0000000000003000 A f6
+# CHECK-NEXT: 0000000000001000 A f7
+# CHECK-NOT: g1
+# CHECK-NOT: g2
+# CHECK-NEXT: 0000000000007500 A newsym
+# CHECK: 0000000000002000 A u
+# CHECK-NOT: unused
+# CHECK-NEXT: 0000000000002000 A v
+# CHECK-NEXT: 0000000000002000 A w
+
+
+# RUN: ld.lld -o chain_with_cycle -T chain_with_cycle.t a.o
+# RUN: llvm-nm chain_with_cycle | FileCheck %s --check-prefix=CHAIN_WITH_CYCLE
+
+# CHAIN_WITH_CYCLE: 000 A f1
+# CHAIN_WITH_CYCLE: 000 A f2
+# CHAIN_WITH_CYCLE: 000 A f3
+# CHAIN_WITH_CYCLE: 000 A f4
+# CHAIN_WITH_CYCLE: 000 A newsym
 
 # RUN: not ld.lld -T chain2.t a.o 2>&1 | FileCheck %s --check-prefix=ERR --implicit-check-not=error:
 # ERR-COUNT-3: error: chain2.t:1: symbol not found: undef
@@ -40,13 +60,30 @@ patatino:
   movl newsym, %eax
 
 #--- chain.t
-PROVIDE(f2 = 0x1000);
+PROVIDE(f7 = 0x1000);
+PROVIDE(f5 = f6 + 0x3000);
+PROVIDE(f6 = f7 + 0x2000);
+PROVIDE(f4 = f5 + 0x1000);
+PROVIDE(f3 = f4);
+PROVIDE(f2 = f3);
 PROVIDE_HIDDEN(f1 = f2);
-PROVIDE(newsym = f1);
+PROVIDE(newsym = f1 + 0x500);
+
+u = v;
+PROVIDE(w = 0x2000);
+PROVIDE(v = w);
 
 PROVIDE(g2 = 0x1000);
 PROVIDE_HIDDEN(g1 = g2);
 PROVIDE(unused = g1);
+PROVIDE_HIDDEN(another_unused = g1);
+
+#--- chain_with_cycle.t
+PROVIDE(f1 = f2 + f3);
+PROVIDE(f2 = f3 + f4);
+PROVIDE(f3 = f4);
+PROVIDE(f4 = f1);
+PROVIDE(newsym = f1);
 
 #--- chain2.t
 PROVIDE(f2 = undef);



More information about the llvm-commits mailing list