[Lldb-commits] [clang] [lldb] [llvm] [BOLT][DWARF][NFC] Refactor address writers (PR #98094)

Mon Jul 8 15:51:57 PDT 2024

https://github.com/sayhaan created https://github.com/llvm/llvm-project/pull/98094

Refactors address writers to create an instance for each CU and its DWO CU.

>From b2fe35ae825dc757ea1daaf49142e789c4a560fc Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at meta.com>
Date: Tue, 1 Jun 2021 11:37:41 -0700
Subject: [PATCH 1/8] Rebase: [Facebook] Add clang driver options to test debug
 info and BOLT

Summary:
This is an essential piece of infrastructure for us to be
continuously testing debug info with BOLT. We can't only make changes
to a test repo because we need to change debuginfo tests to call BOLT,
hence, this diff needs to sit in our opensource repo. But when upstreaming
to LLVM, this should be kept BOLT-only outside of LLVM. When upstreaming,
we need to git diff and check all folders that are being modified by our
commits and discard this one (and leave as an internal diff).

To test BOLT in debuginfo tests, configure it with -DLLVM_TEST_BOLT=ON.
Then run check-lldb and check-debuginfo.

Manual rebase conflict history:
https://phabricator.intern.facebook.com/D29205224
https://phabricator.intern.facebook.com/D29564078
https://phabricator.intern.facebook.com/D33289118
https://phabricator.intern.facebook.com/D34957174
https://phabricator.intern.facebook.com/D35317341

Test Plan:
tested locally
Configured with:
-DLLVM_ENABLE_PROJECTS="clang;lld;lldb;compiler-rt;bolt;debuginfo-tests"
-DLLVM_TEST_BOLT=ON
Ran test suite with:
ninja check-debuginfo
ninja check-lldb

Reviewers: maks, #llvm-bolt

Reviewed By: maks

Subscribers: ayermolo, phabricatorlinter

Differential Revision: https://phabricator.intern.facebook.com/D46256657

Tasks: T92898286
---
 clang/include/clang/Driver/Options.td  |  4 ++++
 clang/lib/Driver/ToolChains/Gnu.cpp    | 29 ++++++++++++++++++++++++++
 cross-project-tests/lit.cfg.py         | 14 ++++++++++++-
 cross-project-tests/lit.site.cfg.py.in |  4 ++++
 lldb/test/API/lit.cfg.py               |  5 +++++
 lldb/test/API/lit.site.cfg.py.in       |  8 +++++++
 lldb/test/Shell/helper/toolchain.py    |  5 +++++
 lldb/test/Shell/lit.site.cfg.py.in     |  9 ++++++++
 llvm/CMakeLists.txt                    |  4 ++++
 9 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index d44faa55c456f..63bb86717bb14 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5483,6 +5483,10 @@ def pg : Flag<["-"], "pg">, HelpText<"Enable mcount instrumentation">,
   MarshallingInfoFlag<CodeGenOpts<"InstrumentForProfiling">>;
 def pipe : Flag<["-", "--"], "pipe">,
   HelpText<"Use pipes between commands, when possible">;
+// Facebook T92898286
+def post_link_optimize : Flag<["--"], "post-link-optimize">,
+  HelpText<"Apply post-link optimizations using BOLT">;
+// End Facebook T92898286
 def prebind__all__twolevel__modules : Flag<["-"], "prebind_all_twolevel_modules">;
 def prebind : Flag<["-"], "prebind">;
 def preload : Flag<["-"], "preload">;
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index b141e5f2adfab..f7611af5763ab 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -672,12 +672,41 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     }
   }
 
+  // Facebook T92898286
+  if (Args.hasArg(options::OPT_post_link_optimize))
+    CmdArgs.push_back("-q");
+  // End Facebook T92898286
+
   Args.AddAllArgs(CmdArgs, options::OPT_T);
 
   const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath());
   C.addCommand(std::make_unique<Command>(JA, *this,
                                          ResponseFileSupport::AtFileCurCP(),
                                          Exec, CmdArgs, Inputs, Output));
+  // Facebook T92898286
+  if (!Args.hasArg(options::OPT_post_link_optimize) || !Output.isFilename())
+    return;
+
+  const char *MvExec = Args.MakeArgString(ToolChain.GetProgramPath("mv"));
+  ArgStringList MoveCmdArgs;
+  MoveCmdArgs.push_back(Output.getFilename());
+  const char *PreBoltBin =
+      Args.MakeArgString(Twine(Output.getFilename()) + ".pre-bolt");
+  MoveCmdArgs.push_back(PreBoltBin);
+  C.addCommand(std::make_unique<Command>(JA, *this, ResponseFileSupport::None(),
+                                         MvExec, MoveCmdArgs, std::nullopt));
+
+  ArgStringList BoltCmdArgs;
+  const char *BoltExec =
+      Args.MakeArgString(ToolChain.GetProgramPath("llvm-bolt"));
+  BoltCmdArgs.push_back(PreBoltBin);
+  BoltCmdArgs.push_back("-reorder-blocks=reverse");
+  BoltCmdArgs.push_back("-update-debug-sections");
+  BoltCmdArgs.push_back("-o");
+  BoltCmdArgs.push_back(Output.getFilename());
+  C.addCommand(std::make_unique<Command>(JA, *this, ResponseFileSupport::None(),
+                                         BoltExec, BoltCmdArgs, std::nullopt));
+  // End Facebook T92898286
 }
 
 void tools::gnutools::Assembler::ConstructJob(Compilation &C,
diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py
index 774c4eaf4d976..619634578dfe6 100644
--- a/cross-project-tests/lit.cfg.py
+++ b/cross-project-tests/lit.cfg.py
@@ -84,7 +84,13 @@ def get_required_attr(config, attr_name):
 # use_clang() and use_lld() respectively, so set them to "", if needed.
 if not hasattr(config, "clang_src_dir"):
     config.clang_src_dir = ""
-llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects))
+# Facebook T92898286
+should_test_bolt = get_required_attr(config, "llvm_test_bolt")
+if should_test_bolt:
+    llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects), additional_flags=["--post-link-optimize"])
+else:
+    llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects))
+# End Facebook T92898286
 
 if not hasattr(config, "lld_src_dir"):
     config.lld_src_dir = ""
@@ -293,3 +299,9 @@ def get_clang_default_dwarf_version_string(triple):
 # Allow 'REQUIRES: XXX-registered-target' in tests.
 for arch in config.targets_to_build:
     config.available_features.add(arch.lower() + "-registered-target")
+
+# Facebook T92898286
+# Ensure the user's PYTHONPATH is included.
+if "PYTHONPATH" in os.environ:
+    config.environment["PYTHONPATH"] = os.environ["PYTHONPATH"]
+# End Facebook T92898286
diff --git a/cross-project-tests/lit.site.cfg.py.in b/cross-project-tests/lit.site.cfg.py.in
index 39458dfc79afd..2d53cd377f033 100644
--- a/cross-project-tests/lit.site.cfg.py.in
+++ b/cross-project-tests/lit.site.cfg.py.in
@@ -21,6 +21,10 @@ config.mlir_src_root = "@MLIR_SOURCE_DIR@"
 
 config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@"
 
+# Facebook T92898286
+config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@")
+# End Facebook T92898286
+
 import lit.llvm
 lit.llvm.initialize(lit_config, config)
 
diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py
index d934349fe3ca3..d4a62c51458cc 100644
--- a/lldb/test/API/lit.cfg.py
+++ b/lldb/test/API/lit.cfg.py
@@ -248,6 +248,11 @@ def delete_module_cache(path):
 if is_configured("lldb_framework_dir"):
     dotest_cmd += ["--framework", config.lldb_framework_dir]
 
+# Facebook T92898286
+if is_configured("llvm_test_bolt"):
+    dotest_cmd += ["-E", '"--post-link-optimize"']
+# End Facebook T92898286
+
 if (
     "lldb-repro-capture" in config.available_features
     or "lldb-repro-replay" in config.available_features
diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in
index 8b2d09ae41cd2..602f45759e48f 100644
--- a/lldb/test/API/lit.site.cfg.py.in
+++ b/lldb/test/API/lit.site.cfg.py.in
@@ -1,5 +1,9 @@
 @LIT_SITE_CFG_IN_HEADER@
 
+#Facebook T92898286
+import lit.util
+#End Facebook T92898286
+
 config.llvm_src_root = "@LLVM_SOURCE_DIR@"
 config.llvm_obj_root = "@LLVM_BINARY_DIR@"
 config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
@@ -39,6 +43,10 @@ config.libcxx_include_target_dir = "@LIBCXX_GENERATED_INCLUDE_TARGET_DIR@"
 config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-api")
 config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-api")
 
+# Facebook T92898286
+config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@")
+# End Facebook T92898286
+
 # Plugins
 lldb_build_intel_pt = '@LLDB_BUILD_INTEL_PT@'
 if lldb_build_intel_pt == '1':
diff --git a/lldb/test/Shell/helper/toolchain.py b/lldb/test/Shell/helper/toolchain.py
index 255955fc70d8c..7b7be06643166 100644
--- a/lldb/test/Shell/helper/toolchain.py
+++ b/lldb/test/Shell/helper/toolchain.py
@@ -165,6 +165,11 @@ def use_support_substitutions(config):
     if config.cmake_sysroot:
         host_flags += ["--sysroot={}".format(config.cmake_sysroot)]
 
+    # Facebook T92898286
+    if config.llvm_test_bolt:
+        host_flags += ["--post-link-optimize"]
+    # End Facebook T92898286
+
     host_flags = " ".join(host_flags)
     config.substitutions.append(("%clang_host", "%clang " + host_flags))
     config.substitutions.append(("%clangxx_host", "%clangxx " + host_flags))
diff --git a/lldb/test/Shell/lit.site.cfg.py.in b/lldb/test/Shell/lit.site.cfg.py.in
index b69e7bce1bc0b..fe8323734b7db 100644
--- a/lldb/test/Shell/lit.site.cfg.py.in
+++ b/lldb/test/Shell/lit.site.cfg.py.in
@@ -1,5 +1,10 @@
 @LIT_SITE_CFG_IN_HEADER@
 
+#Facebook T92898286
+import lit.util
+#End Facebook T92898286
+
+
 config.llvm_src_root = "@LLVM_SOURCE_DIR@"
 config.llvm_obj_root = "@LLVM_BINARY_DIR@"
 config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
@@ -31,6 +36,10 @@ config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@"
 config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-shell")
 config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-shell")
 
+# Facebook T92898286
+config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@")
+# End Facebook T92898286
+
 import lit.llvm
 lit.llvm.initialize(lit_config, config)
 
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 3208147101c0d..ecd36d2564e4f 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -709,6 +709,10 @@ set(LLVM_LIB_FUZZING_ENGINE "" CACHE PATH
 option(LLVM_USE_SPLIT_DWARF
   "Use -gsplit-dwarf when compiling llvm and --gdb-index when linking." OFF)
 
+# Facebook T92898286
+option(LLVM_TEST_BOLT "Enable BOLT testing in non-BOLT tests that use clang" OFF)
+# End Facebook T92898286
+
 # Define an option controlling whether we should build for 32-bit on 64-bit
 # platforms, where supported.
 if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT (WIN32 OR ${CMAKE_SYSTEM_NAME} MATCHES "AIX"))

>From 64454a75323156a1b16eed27b54f7ab5be49d828 Mon Sep 17 00:00:00 2001
From: Rafael Auler <rafaelauler at fb.com>
Date: Thu, 5 Aug 2021 14:17:07 -0700
Subject: [PATCH 2/8] Rebase: [Facebook] [MC] Introduce NeverAlign fragment
 type

Summary:
Introduce NeverAlign fragment type.

The intended usage of this fragment is to insert it before a pair of
macro-op fusion eligible instructions. NeverAlign fragment ensures that
the next fragment (first instruction in the pair) does not end at a
given alignment boundary by emitting a minimal size nop if necessary.

In effect, it ensures that a pair of macro-fusible instructions is not
split by a given alignment boundary, which is a precondition for
macro-op fusion in modern Intel Cores (64B = cache line size, see Intel
Architecture Optimization Reference Manual, 2.3.2.1 Legacy Decode
Pipeline: Macro-Fusion).

This patch introduces functionality used by BOLT when emitting code with
MacroFusion alignment already in place.

The use case is different from BoundaryAlign and instruction bundling:
- BoundaryAlign can be extended to perform the desired alignment for the
first instruction in the macro-op fusion pair (D101817). However, this
approach has higher overhead due to reliance on relaxation as
BoundaryAlign requires in the general case - see
https://reviews.llvm.org/D97982#2710638.
- Instruction bundling: the intent of NeverAlign fragment is to prevent
the first instruction in a pair ending at a given alignment boundary, by
inserting at most one minimum size nop. It's OK if either instruction
crosses the cache line. Padding both instructions using bundles to not
cross the alignment boundary would result in excessive padding. There's
no straightforward way to request instruction bundling to avoid a given
end alignment for the first instruction in the bundle.

LLVM: https://reviews.llvm.org/D97982

Manual rebase conflict history:
https://phabricator.intern.facebook.com/D30142613

Test Plan: sandcastle

Reviewers: #llvm-bolt

Subscribers: phabricatorlinter

Differential Revision: https://phabricator.intern.facebook.com/D31361547
---
 bolt/lib/Core/BinaryEmitter.cpp               |   1 +
 llvm/include/llvm/MC/MCFragment.h             |  22 ++
 llvm/include/llvm/MC/MCObjectStreamer.h       |   2 +
 llvm/include/llvm/MC/MCStreamer.h             |   6 +
 llvm/lib/MC/MCAssembler.cpp                   | 118 ++++++----
 llvm/lib/MC/MCFragment.cpp                    |  12 +
 llvm/lib/MC/MCObjectStreamer.cpp              |   5 +
 llvm/lib/MC/MCStreamer.cpp                    |   2 +
 .../lib/Target/X86/AsmParser/X86AsmParser.cpp |  24 ++
 llvm/test/MC/X86/directive-avoid_end_align.s  | 208 ++++++++++++++++++
 10 files changed, 363 insertions(+), 37 deletions(-)
 create mode 100644 llvm/test/MC/X86/directive-avoid_end_align.s

diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index 5793963f9b80d..c231fffa0d5ff 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -487,6 +487,7 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF,
         // This assumes the second instruction in the macro-op pair will get
         // assigned to its own MCRelaxableFragment. Since all JCC instructions
         // are relaxable, we should be safe.
+        Streamer.emitNeverAlignCodeAtEnd(/*Alignment to avoid=*/64, *BC.STI);
       }
 
       if (!EmitCodeOnly) {
diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h
index a9b19dc56f16a..256d98423e030 100644
--- a/llvm/include/llvm/MC/MCFragment.h
+++ b/llvm/include/llvm/MC/MCFragment.h
@@ -33,6 +33,7 @@ class MCFragment : public ilist_node_with_parent<MCFragment, MCSection> {
 public:
   enum FragmentType : uint8_t {
     FT_Align,
+    FT_NeverAlign,
     FT_Data,
     FT_CompactEncodedInst,
     FT_Fill,
@@ -344,6 +345,27 @@ class MCAlignFragment : public MCFragment {
   }
 };
 
+class MCNeverAlignFragment : public MCFragment {
+  /// The alignment the end of the next fragment should avoid.
+  unsigned Alignment;
+
+  /// When emitting Nops some subtargets have specific nop encodings.
+  const MCSubtargetInfo &STI;
+
+public:
+  MCNeverAlignFragment(unsigned Alignment, const MCSubtargetInfo &STI,
+                       MCSection *Sec = nullptr)
+      : MCFragment(FT_NeverAlign, false, Sec), Alignment(Alignment), STI(STI) {}
+
+  unsigned getAlignment() const { return Alignment; }
+
+  const MCSubtargetInfo &getSubtargetInfo() const { return STI; }
+
+  static bool classof(const MCFragment *F) {
+    return F->getKind() == MCFragment::FT_NeverAlign;
+  }
+};
+
 class MCFillFragment : public MCFragment {
   uint8_t ValueSize;
   /// Value to use for filling bytes.
diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h
index e212d54613980..c7d760721e369 100644
--- a/llvm/include/llvm/MC/MCObjectStreamer.h
+++ b/llvm/include/llvm/MC/MCObjectStreamer.h
@@ -157,6 +157,8 @@ class MCObjectStreamer : public MCStreamer {
                             unsigned MaxBytesToEmit = 0) override;
   void emitCodeAlignment(Align ByteAlignment, const MCSubtargetInfo *STI,
                          unsigned MaxBytesToEmit = 0) override;
+  void emitNeverAlignCodeAtEnd(unsigned ByteAlignment,
+                               const MCSubtargetInfo &STI) override;
   void emitValueToOffset(const MCExpr *Offset, unsigned char Value,
                          SMLoc Loc) override;
   void emitDwarfLocDirective(unsigned FileNo, unsigned Line, unsigned Column,
diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h
index b7468cf70a664..dd813192d9ca0 100644
--- a/llvm/include/llvm/MC/MCStreamer.h
+++ b/llvm/include/llvm/MC/MCStreamer.h
@@ -887,6 +887,12 @@ class MCStreamer {
   virtual void emitCodeAlignment(Align Alignment, const MCSubtargetInfo *STI,
                                  unsigned MaxBytesToEmit = 0);
 
+  /// If the end of the fragment following this NeverAlign fragment ever gets
+  /// aligned to \p ByteAlignment, this fragment emits a single nop before the
+  /// following fragment to break this end-alignment.
+  virtual void emitNeverAlignCodeAtEnd(unsigned ByteAlignment,
+                                       const MCSubtargetInfo &STI);
+
   /// Emit some number of copies of \p Value until the byte offset \p
   /// Offset is reached.
   ///
diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp
index ad30b5ce9e631..62baeb93ea7d0 100644
--- a/llvm/lib/MC/MCAssembler.cpp
+++ b/llvm/lib/MC/MCAssembler.cpp
@@ -298,6 +298,43 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout, const MCFixup &Fixup,
   return IsResolved;
 }
 
+/// Check if the branch crosses the boundary.
+///
+/// \param StartAddr start address of the fused/unfused branch.
+/// \param Size size of the fused/unfused branch.
+/// \param BoundaryAlignment alignment requirement of the branch.
+/// \returns true if the branch cross the boundary.
+static bool mayCrossBoundary(uint64_t StartAddr, uint64_t Size,
+                             Align BoundaryAlignment) {
+  uint64_t EndAddr = StartAddr + Size;
+  return (StartAddr >> Log2(BoundaryAlignment)) !=
+         ((EndAddr - 1) >> Log2(BoundaryAlignment));
+}
+
+/// Check if the branch is against the boundary.
+///
+/// \param StartAddr start address of the fused/unfused branch.
+/// \param Size size of the fused/unfused branch.
+/// \param BoundaryAlignment alignment requirement of the branch.
+/// \returns true if the branch is against the boundary.
+static bool isAgainstBoundary(uint64_t StartAddr, uint64_t Size,
+                              Align BoundaryAlignment) {
+  uint64_t EndAddr = StartAddr + Size;
+  return (EndAddr & (BoundaryAlignment.value() - 1)) == 0;
+}
+
+/// Check if the branch needs padding.
+///
+/// \param StartAddr start address of the fused/unfused branch.
+/// \param Size size of the fused/unfused branch.
+/// \param BoundaryAlignment alignment requirement of the branch.
+/// \returns true if the branch needs padding.
+static bool needPadding(uint64_t StartAddr, uint64_t Size,
+                        Align BoundaryAlignment) {
+  return mayCrossBoundary(StartAddr, Size, BoundaryAlignment) ||
+         isAgainstBoundary(StartAddr, Size, BoundaryAlignment);
+}
+
 uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout,
                                           const MCFragment &F) const {
   assert(getBackendPtr() && "Requires assembler backend");
@@ -358,6 +395,41 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout,
     return Size;
   }
 
+  case MCFragment::FT_NeverAlign: {
+    // Disclaimer: NeverAlign fragment size depends on the size of its immediate
+    // successor, but NeverAlign need not be a MCRelaxableFragment.
+    // NeverAlign fragment size is recomputed if the successor is relaxed:
+    // - If RelaxableFragment is relaxed, it gets invalidated by marking its
+    // predecessor as LastValidFragment.
+    // - This forces the assembler to call MCAsmLayout::layoutFragment on that
+    // relaxable fragment, which in turn will always ask the predecessor to
+    // compute its size (see "computeFragmentSize(prev)" in layoutFragment).
+    //
+    // In short, the simplest way to ensure that computeFragmentSize() is sane
+    // is to establish the following rule: it should never examine fragments
+    // after the current fragment in the section. If we logically need to
+    // examine any fragment after the current fragment, we need to do that using
+    // relaxation, inside MCAssembler::layoutSectionOnce.
+    const MCNeverAlignFragment &NAF = cast<MCNeverAlignFragment>(F);
+    const MCFragment *NF = F.getNextNode();
+    uint64_t Offset = Layout.getFragmentOffset(&NAF);
+    size_t NextFragSize = 0;
+    if (const auto *NextFrag = dyn_cast<MCRelaxableFragment>(NF)) {
+      NextFragSize = NextFrag->getContents().size();
+    } else if (const auto *NextFrag = dyn_cast<MCDataFragment>(NF)) {
+      NextFragSize = NextFrag->getContents().size();
+    } else {
+      llvm_unreachable("Didn't find the expected fragment after NeverAlign");
+    }
+    // Check if the next fragment ends at the alignment we want to avoid.
+    if (isAgainstBoundary(Offset, NextFragSize, Align(NAF.getAlignment()))) {
+      // Avoid this alignment by introducing minimum nop.
+      assert(getBackend().getMinimumNopSize() != NAF.getAlignment());
+      return getBackend().getMinimumNopSize();
+    }
+    return 0;
+  }
+
   case MCFragment::FT_Org: {
     const MCOrgFragment &OF = cast<MCOrgFragment>(F);
     MCValue Value;
@@ -581,6 +653,15 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm,
     break;
   }
 
+  case MCFragment::FT_NeverAlign: {
+    const MCNeverAlignFragment &NAF = cast<MCNeverAlignFragment>(F);
+    if (!Asm.getBackend().writeNopData(OS, FragmentSize,
+                                       &NAF.getSubtargetInfo()))
+      report_fatal_error("unable to write nop sequence of " +
+                         Twine(FragmentSize) + " bytes");
+    break;
+  }
+
   case MCFragment::FT_Data:
     ++stats::EmittedDataFragments;
     OS << cast<MCDataFragment>(F).getContents();
@@ -1052,43 +1133,6 @@ bool MCAssembler::relaxLEB(MCAsmLayout &Layout, MCLEBFragment &LF) {
   return OldSize != LF.getContents().size();
 }
 
-/// Check if the branch crosses the boundary.
-///
-/// \param StartAddr start address of the fused/unfused branch.
-/// \param Size size of the fused/unfused branch.
-/// \param BoundaryAlignment alignment requirement of the branch.
-/// \returns true if the branch cross the boundary.
-static bool mayCrossBoundary(uint64_t StartAddr, uint64_t Size,
-                             Align BoundaryAlignment) {
-  uint64_t EndAddr = StartAddr + Size;
-  return (StartAddr >> Log2(BoundaryAlignment)) !=
-         ((EndAddr - 1) >> Log2(BoundaryAlignment));
-}
-
-/// Check if the branch is against the boundary.
-///
-/// \param StartAddr start address of the fused/unfused branch.
-/// \param Size size of the fused/unfused branch.
-/// \param BoundaryAlignment alignment requirement of the branch.
-/// \returns true if the branch is against the boundary.
-static bool isAgainstBoundary(uint64_t StartAddr, uint64_t Size,
-                              Align BoundaryAlignment) {
-  uint64_t EndAddr = StartAddr + Size;
-  return (EndAddr & (BoundaryAlignment.value() - 1)) == 0;
-}
-
-/// Check if the branch needs padding.
-///
-/// \param StartAddr start address of the fused/unfused branch.
-/// \param Size size of the fused/unfused branch.
-/// \param BoundaryAlignment alignment requirement of the branch.
-/// \returns true if the branch needs padding.
-static bool needPadding(uint64_t StartAddr, uint64_t Size,
-                        Align BoundaryAlignment) {
-  return mayCrossBoundary(StartAddr, Size, BoundaryAlignment) ||
-         isAgainstBoundary(StartAddr, Size, BoundaryAlignment);
-}
-
 bool MCAssembler::relaxBoundaryAlign(MCAsmLayout &Layout,
                                      MCBoundaryAlignFragment &BF) {
   // BoundaryAlignFragment that doesn't need to align any fragment should not be
diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp
index 7d0826802d0af..6e09caaab0957 100644
--- a/llvm/lib/MC/MCFragment.cpp
+++ b/llvm/lib/MC/MCFragment.cpp
@@ -274,6 +274,9 @@ void MCFragment::destroy() {
     case FT_Align:
       delete cast<MCAlignFragment>(this);
       return;
+    case FT_NeverAlign:
+      delete cast<MCNeverAlignFragment>(this);
+      return;
     case FT_Data:
       delete cast<MCDataFragment>(this);
       return;
@@ -342,6 +345,9 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
   OS << "<";
   switch (getKind()) {
   case MCFragment::FT_Align: OS << "MCAlignFragment"; break;
+  case MCFragment::FT_NeverAlign:
+    OS << "MCNeverAlignFragment";
+    break;
   case MCFragment::FT_Data:  OS << "MCDataFragment"; break;
   case MCFragment::FT_CompactEncodedInst:
     OS << "MCCompactEncodedInstFragment"; break;
@@ -381,6 +387,12 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
        << " MaxBytesToEmit:" << AF->getMaxBytesToEmit() << ">";
     break;
   }
+  case MCFragment::FT_NeverAlign: {
+    const MCNeverAlignFragment *NAF = cast<MCNeverAlignFragment>(this);
+    OS << "\n       ";
+    OS << " Alignment:" << NAF->getAlignment() << ">";
+    break;
+  }
   case MCFragment::FT_Data:  {
     const auto *DF = cast<MCDataFragment>(this);
     OS << "\n       ";
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index 0ccade91677a4..117475b7dd90b 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -658,6 +658,11 @@ void MCObjectStreamer::emitCodeAlignment(Align Alignment,
   cast<MCAlignFragment>(getCurrentFragment())->setEmitNops(true, STI);
 }
 
+void MCObjectStreamer::emitNeverAlignCodeAtEnd(unsigned ByteAlignment,
+                                               const MCSubtargetInfo &STI) {
+  insert(new MCNeverAlignFragment(ByteAlignment, STI));
+}
+
 void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset,
                                          unsigned char Value,
                                          SMLoc Loc) {
diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp
index 199d865ea3496..a97cba6c89972 100644
--- a/llvm/lib/MC/MCStreamer.cpp
+++ b/llvm/lib/MC/MCStreamer.cpp
@@ -1235,6 +1235,8 @@ void MCStreamer::emitValueToAlignment(Align Alignment, int64_t Value,
                                       unsigned MaxBytesToEmit) {}
 void MCStreamer::emitCodeAlignment(Align Alignment, const MCSubtargetInfo *STI,
                                    unsigned MaxBytesToEmit) {}
+void MCStreamer::emitNeverAlignCodeAtEnd(unsigned ByteAlignment,
+                                         const MCSubtargetInfo &STI) {}
 void MCStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value,
                                    SMLoc Loc) {}
 void MCStreamer::emitBundleAlignMode(Align Alignment) {}
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 6623106109316..6c6bd2cf31e86 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1153,6 +1153,7 @@ class X86AsmParser : public MCTargetAsmParser {
   bool parseDirectiveArch();
   bool parseDirectiveNops(SMLoc L);
   bool parseDirectiveEven(SMLoc L);
+  bool parseDirectiveAvoidEndAlign(SMLoc L);
   bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
 
   /// CodeView FPO data directives.
@@ -4601,6 +4602,8 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
     return false;
   } else if (IDVal == ".nops")
     return parseDirectiveNops(DirectiveID.getLoc());
+  else if (IDVal == ".avoid_end_align")
+    return parseDirectiveAvoidEndAlign(DirectiveID.getLoc());
   else if (IDVal == ".even")
     return parseDirectiveEven(DirectiveID.getLoc());
   else if (IDVal == ".cv_fpo_proc")
@@ -4695,6 +4698,27 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) {
   return false;
 }
 
+/// Directive for NeverAlign fragment testing, not for general usage!
+/// parseDirectiveAvoidEndAlign
+///  ::= .avoid_end_align alignment
+bool X86AsmParser::parseDirectiveAvoidEndAlign(SMLoc L) {
+  int64_t Alignment = 0;
+  SMLoc AlignmentLoc;
+  AlignmentLoc = getTok().getLoc();
+  if (getParser().checkForValidSection() ||
+      getParser().parseAbsoluteExpression(Alignment))
+    return true;
+
+  if (getParser().parseEOL("unexpected token in directive"))
+    return true;
+
+  if (Alignment <= 0)
+    return Error(AlignmentLoc, "expected a positive alignment");
+
+  getParser().getStreamer().emitNeverAlignCodeAtEnd(Alignment, getSTI());
+  return false;
+}
+
 /// ParseDirectiveCode
 ///  ::= .code16 | .code32 | .code64
 bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
diff --git a/llvm/test/MC/X86/directive-avoid_end_align.s b/llvm/test/MC/X86/directive-avoid_end_align.s
new file mode 100644
index 0000000000000..1d748401edc12
--- /dev/null
+++ b/llvm/test/MC/X86/directive-avoid_end_align.s
@@ -0,0 +1,208 @@
+# RUN: llvm-mc -triple=x86_64 -filetype=obj %s | llvm-objdump --no-show-raw-insn -d - | FileCheck %s
+# RUN: not llvm-mc -triple=x86_64 --defsym ERR=1 %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR
+
+# avoid_end_align has no effect since test doesn't end at alignment boundary:
+.avoid_end_align 64
+# CHECK-NOT: nop
+  testl %eax, %eax
+# CHECK: testl %eax, %eax
+  je  .LBB0
+
+.fill 58, 1, 0x00
+# NeverAlign followed by MCDataFragment:
+# avoid_end_align inserts nop because `test` would end at alignment boundary:
+.avoid_end_align 64
+# CHECK: 			3e: nop
+  testl %eax, %eax
+# CHECK-NEXT: 3f: testl %eax, %eax
+  je  .LBB0
+# CHECK-NEXT: 41: je
+.LBB0:
+  retq
+
+.p2align 6
+.L0:
+.nops 57
+  int3
+# NeverAlign followed by RelaxableFragment:
+.avoid_end_align 64
+# CHECK: 			ba: nop
+  cmpl $(.L1-.L0), %eax
+# CHECK-NEXT: bb: cmpl
+  je  .L0
+# CHECK-NEXT: c1: je
+.nops 65
+.L1:
+
+###############################################################################
+# Experiment A:
+# Check that NeverAlign doesn't introduce infinite loops in layout.
+# Control:
+# 1. NeverAlign fragment is not added,
+# 2. Short formats of cmp and jcc are used (3 and 2 bytes respectively),
+# 3. cmp and jcc are placed such that to be split by 64B alignment boundary.
+# 4. jcc would be relaxed to a longer format if at least one byte is added
+#    between .L10 and je itself, e.g. by adding a NeverAlign padding byte,
+#    or relaxing cmp instruction.
+# 5. cmp would be relaxed to a longer format if at least one byte is added
+#    between .L11 and .L12, e.g. due to relaxing jcc instruction.
+.p2align 6
+# CHECK:      140: int3
+.fill 2, 1, 0xcc
+.L10:
+.nops 122
+  int3
+# CHECK:      1bc: int3
+# no avoid_end_align here
+# CHECK-NOT:  nop
+  cmp $(.L12-.L11), %eax
+# CHECK:      1bd: cmpl
+.L11:
+  je  .L10
+# CHECK-NEXT: 1c0: je
+.nops 125
+.L12:
+
+# Experiment:
+# Same setup as control, except NeverAlign fragment is added before cmp.
+# Expected effect:
+# 1. NeverAlign pads cmp+jcc by one byte since cmp and jcc are split by a 64B
+#    alignment boundary,
+# 2. This extra byte forces jcc relaxation to a longer format (Control rule #4),
+# 3. This results in an cmp relaxation (Control rule #5),
+# 4. Which in turn makes NeverAlign fragment unnecessary as cmp and jcc
+#    are no longer split by an alignment boundary (cmp crosses the boundary).
+# 5. NeverAlign padding is removed.
+# 6. cmp and jcc instruction remain in relaxed form.
+# 7. Relaxation converges, layout succeeds.
+.p2align 6
+# CHECK:      240: int3
+.fill 2, 1, 0xcc
+.L20:
+.nops 122
+  int3
+# CHECK:      2bc: int3
+.avoid_end_align 64
+# CHECK-NOT: 	nop
+  cmp $(.L22-.L21), %eax
+# CHECK-NEXT: 2bd: cmpl
+.L21:
+  je  .L20
+# CHECK-NEXT: 2c3: je
+.nops 125
+.L22:
+
+###############################################################################
+# Experiment B: similar to exp A, but we check that once NeverAlign padding is
+# removed from the layout (exp A, experiment step 5), the increased distance
+# between the symbols L33 and L34 triggers the relaxation of instruction at
+# label L32.
+#
+# Control 1: using a one-byte instruction at L33 (site of NeverAlign) leads to
+# steps 2-3 of exp A, experiment:
+# 2. This extra byte forces jcc relaxation to a longer format (Control rule #4),
+# 3. This results in an cmp relaxation (Control rule #5),
+# => short cmp under L32
+.p2align 6
+# CHECK:      380: int3
+.fill 2, 1, 0xcc
+.L30:
+.nops 122
+  int3
+# CHECK:      3fc: int3
+  hlt
+#.avoid_end_align 64
+.L33:
+  cmp $(.L32-.L31), %eax
+# CHECK:      3fe: cmpl
+.L31:
+  je  .L30
+# CHECK-NEXT: 404: je
+.nops 114
+.p2align 1
+  int3
+  int3
+# CHECK:      47c: int3
+.L34:
+.nops 9
+.L32:
+  cmp $(.L33-.L34), %eax
+# CHECK:      487: cmp
+# note that the size of cmp is 48a-487 == 3 bytes (distance is exactly -128)
+  int3
+# CHECK-NEXT: 48a: int3
+
+# Control 2: leaving out a byte at L43 (site of NeverAlign), plus
+# relaxed jcc and cmp leads to a relaxed cmp under L42 (-129 as cmp's immediate)
+.p2align 6
+# CHECK:      4c0: int3
+.fill 2, 1, 0xcc
+.L40:
+.nops 122
+  int3
+# CHECK:      53c: int3
+#  int3
+#.avoid_end_align 64
+.L43:
+  cmp $(.L42-.L41+0x100), %eax
+# CHECK:      53d: cmpl
+.L41:
+  je  .L40+0x100
+# CHECK-NEXT: 543: je
+.nops 114
+.p2align 1
+  int3
+  int3
+# CHECK:      5bc: int3
+.L44:
+.nops 9
+.L42:
+  cmp $(.L43-.L44), %eax
+# CHECK:      5c7: cmp
+# note that the size of cmp is 5cd-5c7 == 6 bytes (distance is exactly -129)
+  int3
+# CHECK-NEXT: 5cd: int3
+
+# Experiment
+# Checking if removing NeverAlign padding at L53 as a result of alignment and
+# relaxation of cmp and jcc following it (see exp A), thus reproducing the case
+# in Control 2 (getting a relaxed cmp under L52), is handled correctly.
+.p2align 6
+# CHECK:      600: int3
+.fill 2, 1, 0xcc
+.L50:
+.nops 122
+  int3
+# CHECK:      67c: int3
+.avoid_end_align 64
+.L53:
+# CHECK-NOT: 	nop
+  cmp $(.L52-.L51), %eax
+# CHECK-NEXT: 67d: cmpl
+.L51:
+  je  .L50
+# CHECK-NEXT: 683: je
+.nops 114
+.p2align 1
+  int3
+  int3
+# CHECK:      6fc: int3
+.L54:
+.nops 9
+.L52:
+  cmp $(.L53-.L54), %eax
+# CHECK:      707: cmp
+# note that the size of cmp is 70d-707 == 6 bytes (distance is exactly -129)
+  int3
+# CHECK-NEXT: 70d: int3
+
+.ifdef ERR
+# ERR: {{.*}}.s:[[#@LINE+1]]:17: error: unknown token in expression
+.avoid_end_align
+# ERR: {{.*}}.s:[[#@LINE+1]]:18: error: expected absolute expression
+.avoid_end_align x
+# ERR: {{.*}}.s:[[#@LINE+1]]:18: error: expected a positive alignment
+.avoid_end_align 0
+# ERR: {{.*}}.s:[[#@LINE+1]]:20: error: unexpected token in directive
+.avoid_end_align 64, 0
+.endif

>From d20d82d3259ce0e12d8a456e764e7c97a07decdc Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at meta.com>
Date: Sun, 2 Jun 2024 11:29:35 -0700
Subject: [PATCH 3/8] Revert "[MC] Speed up AttemptToFoldSymbolOffsetDifference
 in the absence of MCAsmLayout"

Summary:
This reverts commit 92d2c1b67835248d5d7371fa06381f970687c68f.

Causes timeouts in BOLT tests

Test Plan: sandcastle

Reviewers: ayermolo, davidino, #llvm-bolt

Reviewed By: davidino

Differential Revision: https://phabricator.intern.facebook.com/D58066252

Tasks: T191101004

Tags: accept2ship, publish_when_ready
---
 llvm/lib/MC/MCExpr.cpp | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index b065d03651c45..9add574b0e5b8 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -675,14 +675,8 @@ static void AttemptToFoldSymbolOffsetDifference(
     if (FA == FB) {
       Reverse = SA.getOffset() < SB.getOffset();
     } else if (!isa<MCDummyFragment>(FA)) {
-      // Testing FA < FB is slow. Use setLayoutOrder to speed up computation.
-      // The formal layout order will be finalized in MCAssembler::layout.
-      if (FA->getLayoutOrder() == 0 || FB->getLayoutOrder()== 0) {
-        unsigned LayoutOrder = 0;
-        for (MCFragment &F : *FA->getParent())
-          F.setLayoutOrder(++LayoutOrder);
-      }
-      Reverse = FA->getLayoutOrder() < FB->getLayoutOrder();
+      Reverse = std::find_if(std::next(FA->getIterator()), SecA.end(),
+                             [&](auto &I) { return &I == FB; }) != SecA.end();
     }
 
     uint64_t SAOffset = SA.getOffset(), SBOffset = SB.getOffset();

>From e9d02dc9f7a81b6dee42608e914fb2d9a8457604 Mon Sep 17 00:00:00 2001
From: Sayhaan Siddiqui <sayhaan at meta.com>
Date: Tue, 4 Jun 2024 13:04:25 -0700
Subject: [PATCH 4/8] Replace usages of GDBIndex functions

Summary: Replace old usages of GDB Index functions to use the new class.

Test Plan: Run `ninja check llvm-bolt`

Reviewers: ayermolo, rafaelauler, aaupov, maks, #llvm-bolt

Reviewed By: ayermolo

Differential Revision: https://phabricator.intern.facebook.com/D58156978

Tasks: T187529450
---
 bolt/include/bolt/Rewrite/DWARFRewriter.h |  4 +-
 bolt/lib/Rewrite/DWARFRewriter.cpp        | 61 +++++++++++++----------
 2 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/bolt/include/bolt/Rewrite/DWARFRewriter.h b/bolt/include/bolt/Rewrite/DWARFRewriter.h
index 8dec32de9008e..3cc9d823c815b 100644
--- a/bolt/include/bolt/Rewrite/DWARFRewriter.h
+++ b/bolt/include/bolt/Rewrite/DWARFRewriter.h
@@ -12,6 +12,7 @@
 #include "bolt/Core/DIEBuilder.h"
 #include "bolt/Core/DebugData.h"
 #include "bolt/Core/DebugNames.h"
+#include "bolt/Core/GDBIndex.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/DIE.h"
 #include "llvm/DWP/DWP.h"
@@ -131,7 +132,8 @@ class DWARFRewriter {
   makeFinalLocListsSection(DWARFVersion Version);
 
   /// Finalize type sections in the main binary.
-  CUOffsetMap finalizeTypeSections(DIEBuilder &DIEBlder, DIEStreamer &Streamer);
+  CUOffsetMap finalizeTypeSections(DIEBuilder &DIEBlder, DIEStreamer &Streamer,
+                                   GDBIndex &GDBIndexSection);
 
   /// Process and write out CUs that are passsed in.
   void finalizeCompileUnits(DIEBuilder &DIEBlder, DIEStreamer &Streamer,
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 8814ebbd10aa5..7b62999dfb2b6 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -185,6 +185,7 @@ namespace bolt {
 class DIEStreamer : public DwarfStreamer {
   DIEBuilder *DIEBldr;
   DWARFRewriter &Rewriter;
+  GDBIndex &GDBIndexSection;
 
 private:
   /// Emit the compilation unit header for \p Unit in the debug_info
@@ -247,7 +248,7 @@ class DIEStreamer : public DwarfStreamer {
     const uint64_t TypeSignature = cast<DWARFTypeUnit>(Unit).getTypeHash();
     DIE *TypeDIE = DIEBldr->getTypeDIE(Unit);
     const DIEBuilder::DWARFUnitInfo &UI = DIEBldr->getUnitInfoByDwarfUnit(Unit);
-    Rewriter.addGDBTypeUnitEntry(
+    GDBIndexSection.addGDBTypeUnitEntry(
         {UI.UnitOffset, TypeSignature, TypeDIE->getOffset()});
     if (Unit.getVersion() < 5) {
       // Switch the section to .debug_types section.
@@ -279,11 +280,12 @@ class DIEStreamer : public DwarfStreamer {
 
 public:
   DIEStreamer(DIEBuilder *DIEBldr, DWARFRewriter &Rewriter,
+              GDBIndex &GDBIndexSection,
               DWARFLinkerBase::OutputFileType OutFileType,
               raw_pwrite_stream &OutFile,
               DWARFLinkerBase::MessageHandlerTy Warning)
       : DwarfStreamer(OutFileType, OutFile, Warning), DIEBldr(DIEBldr),
-        Rewriter(Rewriter){};
+        Rewriter(Rewriter), GDBIndexSection(GDBIndexSection) {};
 
   using DwarfStreamer::emitCompileUnitHeader;
 
@@ -326,12 +328,11 @@ static cl::opt<bool> KeepARanges(
         "keep or generate .debug_aranges section if .gdb_index is written"),
     cl::Hidden, cl::cat(BoltCategory));
 
-static cl::opt<bool>
-DeterministicDebugInfo("deterministic-debuginfo",
-  cl::desc("disables parallel execution of tasks that may produce "
-           "nondeterministic debug info"),
-  cl::init(true),
-  cl::cat(BoltCategory));
+static cl::opt<bool> DeterministicDebugInfo(
+    "deterministic-debuginfo",
+    cl::desc("disables parallel execution of tasks that may produce "
+             "nondeterministic debug info"),
+    cl::init(true), cl::cat(BoltCategory));
 
 static cl::opt<std::string> DwarfOutputPath(
     "dwarf-output-path",
@@ -460,10 +461,11 @@ static std::optional<uint64_t> getAsAddress(const DWARFUnit &DU,
 static std::unique_ptr<DIEStreamer>
 createDIEStreamer(const Triple &TheTriple, raw_pwrite_stream &OutFile,
                   StringRef Swift5ReflectionSegmentName, DIEBuilder &DIEBldr,
-                  DWARFRewriter &Rewriter) {
+                  DWARFRewriter &Rewriter, GDBIndex &GDBIndexSection) {
 
   std::unique_ptr<DIEStreamer> Streamer = std::make_unique<DIEStreamer>(
-      &DIEBldr, Rewriter, DWARFLinkerBase::OutputFileType::Object, OutFile,
+      &DIEBldr, Rewriter, GDBIndexSection,
+      DWARFLinkerBase::OutputFileType::Object, OutFile,
       [&](const Twine &Warning, StringRef Context, const DWARFDie *) {});
   Error Err = Streamer->init(TheTriple, Swift5ReflectionSegmentName);
   if (Err)
@@ -484,13 +486,12 @@ emitUnit(DIEBuilder &DIEBldr, DIEStreamer &Streamer, DWARFUnit &Unit) {
   return {U.UnitOffset, U.UnitLength, TypeHash};
 }
 
-static void emitDWOBuilder(const std::string &DWOName,
-                           DIEBuilder &DWODIEBuilder, DWARFRewriter &Rewriter,
-                           DWARFUnit &SplitCU, DWARFUnit &CU,
-                           DWARFRewriter::DWPState &State,
-                           DebugLocWriter &LocWriter,
-                           DebugStrOffsetsWriter &StrOffstsWriter,
-                           DebugStrWriter &StrWriter) {
+static void
+emitDWOBuilder(const std::string &DWOName, DIEBuilder &DWODIEBuilder,
+               DWARFRewriter &Rewriter, DWARFUnit &SplitCU, DWARFUnit &CU,
+               DWARFRewriter::DWPState &State, DebugLocWriter &LocWriter,
+               DebugStrOffsetsWriter &StrOffstsWriter,
+               DebugStrWriter &StrWriter, GDBIndex &GDBIndexSection) {
   // Populate debug_info and debug_abbrev for current dwo into StringRef.
   DWODIEBuilder.generateAbbrevs();
   DWODIEBuilder.finish();
@@ -500,8 +501,9 @@ static void emitDWOBuilder(const std::string &DWOName,
       std::make_shared<raw_svector_ostream>(OutBuffer);
   const object::ObjectFile *File = SplitCU.getContext().getDWARFObj().getFile();
   auto TheTriple = std::make_unique<Triple>(File->makeTriple());
-  std::unique_ptr<DIEStreamer> Streamer = createDIEStreamer(
-      *TheTriple, *ObjOS, "DwoStreamerInitAug2", DWODIEBuilder, Rewriter);
+  std::unique_ptr<DIEStreamer> Streamer =
+      createDIEStreamer(*TheTriple, *ObjOS, "DwoStreamerInitAug2",
+                        DWODIEBuilder, Rewriter, GDBIndexSection);
   DWARFRewriter::UnitMetaVectorType TUMetaVector;
   DWARFRewriter::UnitMeta CUMI = {0, 0, 0};
   if (SplitCU.getContext().getMaxDWOVersion() >= 5) {
@@ -652,6 +654,7 @@ void DWARFRewriter::updateDebugInfo() {
 
   DWARF5AcceleratorTable DebugNamesTable(opts::CreateDebugNames, BC,
                                          *StrWriter);
+  GDBIndex GDBIndexSection(BC);
   DWPState State;
   if (opts::WriteDWP)
     initDWPState(State);
@@ -704,7 +707,8 @@ void DWARFRewriter::updateDebugInfo() {
         TempRangesSectionWriter->finalizeSection();
 
       emitDWOBuilder(DWOName, DWODIEBuilder, *this, **SplitCU, *Unit, State,
-                     DebugLocDWoWriter, DWOStrOffstsWriter, DWOStrWriter);
+                     DebugLocDWoWriter, DWOStrOffstsWriter, DWOStrWriter,
+                     GDBIndexSection);
     }
 
     if (Unit->getVersion() >= 5) {
@@ -729,9 +733,10 @@ void DWARFRewriter::updateDebugInfo() {
       std::make_unique<raw_svector_ostream>(OutBuffer);
   const object::ObjectFile *File = BC.DwCtx->getDWARFObj().getFile();
   auto TheTriple = std::make_unique<Triple>(File->makeTriple());
-  std::unique_ptr<DIEStreamer> Streamer =
-      createDIEStreamer(*TheTriple, *ObjOS, "TypeStreamer", DIEBlder, *this);
-  CUOffsetMap OffsetMap = finalizeTypeSections(DIEBlder, *Streamer);
+  std::unique_ptr<DIEStreamer> Streamer = createDIEStreamer(
+      *TheTriple, *ObjOS, "TypeStreamer", DIEBlder, *this, GDBIndexSection);
+  CUOffsetMap OffsetMap =
+      finalizeTypeSections(DIEBlder, *Streamer, GDBIndexSection);
 
   const bool SingleThreadedMode =
       opts::NoThreads || opts::DeterministicDebugInfo;
@@ -761,7 +766,8 @@ void DWARFRewriter::updateDebugInfo() {
 
   finalizeDebugSections(DIEBlder, DebugNamesTable, *Streamer, *ObjOS,
                         OffsetMap);
-  updateGdbIndexSection(OffsetMap, CUIndex);
+  GDBIndexSection.updateGdbIndexSection(OffsetMap, CUIndex,
+                                        *ARangesSectionWriter);
 }
 
 void DWARFRewriter::updateUnitDebugInfo(
@@ -1429,7 +1435,8 @@ void DWARFRewriter::updateLineTableOffsets(const MCAsmLayout &Layout) {
 }
 
 CUOffsetMap DWARFRewriter::finalizeTypeSections(DIEBuilder &DIEBlder,
-                                                DIEStreamer &Streamer) {
+                                                DIEStreamer &Streamer,
+                                                GDBIndex &GDBIndexSection) {
   // update TypeUnit DW_AT_stmt_list with new .debug_line information.
   auto updateLineTable = [&](const DWARFUnit &Unit) -> void {
     DIE *UnitDIE = DIEBlder.getUnitDIEbyUnit(Unit);
@@ -1449,8 +1456,8 @@ CUOffsetMap DWARFRewriter::finalizeTypeSections(DIEBuilder &DIEBlder,
       std::make_shared<raw_svector_ostream>(OutBuffer);
   const object::ObjectFile *File = BC.DwCtx->getDWARFObj().getFile();
   auto TheTriple = std::make_unique<Triple>(File->makeTriple());
-  std::unique_ptr<DIEStreamer> TypeStreamer =
-      createDIEStreamer(*TheTriple, *ObjOS, "TypeStreamer", DIEBlder, *this);
+  std::unique_ptr<DIEStreamer> TypeStreamer = createDIEStreamer(
+      *TheTriple, *ObjOS, "TypeStreamer", DIEBlder, *this, GDBIndexSection);
 
   // generate debug_info and CUMap
   CUOffsetMap CUMap;

>From 1bed2fff4f0dbbe9e9599faeb0c20d3a95884c5b Mon Sep 17 00:00:00 2001
From: Maksim Panchenko <maks at meta.com>
Date: Tue, 11 Jun 2024 10:52:51 -0700
Subject: [PATCH 5/8] [BOLT] Add auto parsing for Linux kernel .altinstructions
 (#95068)

Summary:
.altinstructions section contains a list of structures where fields can
have different sizes while other fields could be present or not
depending on the kernel version. Add automatic detection of such
variations and use it by default. The user can still overwrite the
automatic detection with `--alt-inst-has-padlen` and
`--alt-inst-feature-size` options.

https://github.com/llvm/llvm-project/pull/95068

Test Plan: sandcastle

Reviewers: aaupov, #llvm-bolt

Reviewed By: aaupov

Differential Revision: https://phabricator.intern.facebook.com/D58756460
---
 bolt/lib/Rewrite/LinuxKernelRewriter.cpp | 72 ++++++++++++++++++++++--
 bolt/test/X86/linux-alt-instruction.s    | 20 ++++---
 2 files changed, 80 insertions(+), 12 deletions(-)

diff --git a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
index b2c8b2446f7e1..6b3f5bce9f0f5 100644
--- a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
+++ b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
@@ -273,6 +273,8 @@ class LinuxKernelRewriter final : public MetadataRewriter {
 
   /// Handle alternative instruction info from .altinstructions.
   Error readAltInstructions();
+  Error tryReadAltInstructions(uint32_t AltInstFeatureSize,
+                               bool AltInstHasPadLen, bool ParseOnly);
   Error rewriteAltInstructions();
 
   /// Read .pci_fixup
@@ -1319,12 +1321,69 @@ Error LinuxKernelRewriter::rewriteBugTable() {
 ///	    u8  padlen;         // present in older kernels
 ///   } __packed;
 ///
-/// Note the structures is packed.
+/// Note that the structure is packed.
+///
+/// Since the size of the "feature" field could be either u16 or u32, and
+/// "padlen" presence is unknown, we attempt to parse .altinstructions section
+/// using all possible combinations (four at this time). Since we validate the
+/// contents of the section and its size, the detection works quite well.
+/// Still, we leave the user the opportunity to specify these features on the
+/// command line and skip the guesswork.
 Error LinuxKernelRewriter::readAltInstructions() {
   AltInstrSection = BC.getUniqueSectionByName(".altinstructions");
   if (!AltInstrSection)
     return Error::success();
 
+  // Presence of "padlen" field.
+  std::vector<bool> PadLenVariants;
+  if (opts::AltInstHasPadLen.getNumOccurrences())
+    PadLenVariants.push_back(opts::AltInstHasPadLen);
+  else
+    PadLenVariants = {false, true};
+
+  // Size (in bytes) variants of "feature" field.
+  std::vector<uint32_t> FeatureSizeVariants;
+  if (opts::AltInstFeatureSize.getNumOccurrences())
+    FeatureSizeVariants.push_back(opts::AltInstFeatureSize);
+  else
+    FeatureSizeVariants = {2, 4};
+
+  for (bool AltInstHasPadLen : PadLenVariants) {
+    for (uint32_t AltInstFeatureSize : FeatureSizeVariants) {
+      LLVM_DEBUG({
+        dbgs() << "BOLT-DEBUG: trying AltInstHasPadLen = " << AltInstHasPadLen
+               << "; AltInstFeatureSize = " << AltInstFeatureSize << ";\n";
+      });
+      if (Error E = tryReadAltInstructions(AltInstFeatureSize, AltInstHasPadLen,
+                                           /*ParseOnly*/ true)) {
+        consumeError(std::move(E));
+        continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "Matched .altinstructions format\n");
+
+      if (!opts::AltInstHasPadLen.getNumOccurrences())
+        BC.outs() << "BOLT-INFO: setting --" << opts::AltInstHasPadLen.ArgStr
+                  << '=' << AltInstHasPadLen << '\n';
+
+      if (!opts::AltInstFeatureSize.getNumOccurrences())
+        BC.outs() << "BOLT-INFO: setting --" << opts::AltInstFeatureSize.ArgStr
+                  << '=' << AltInstFeatureSize << '\n';
+
+      return tryReadAltInstructions(AltInstFeatureSize, AltInstHasPadLen,
+                                    /*ParseOnly*/ false);
+    }
+  }
+
+  // We couldn't match the format. Read again to properly propagate the error
+  // to the user.
+  return tryReadAltInstructions(opts::AltInstFeatureSize,
+                                opts::AltInstHasPadLen, /*ParseOnly*/ false);
+}
+
+Error LinuxKernelRewriter::tryReadAltInstructions(uint32_t AltInstFeatureSize,
+                                                  bool AltInstHasPadLen,
+                                                  bool ParseOnly) {
   const uint64_t Address = AltInstrSection->getAddress();
   DataExtractor DE = DataExtractor(AltInstrSection->getContents(),
                                    BC.AsmInfo->isLittleEndian(),
@@ -1336,12 +1395,12 @@ Error LinuxKernelRewriter::readAltInstructions() {
         Address + Cursor.tell() + (int32_t)DE.getU32(Cursor);
     const uint64_t AltInstAddress =
         Address + Cursor.tell() + (int32_t)DE.getU32(Cursor);
-    const uint64_t Feature = DE.getUnsigned(Cursor, opts::AltInstFeatureSize);
+    const uint64_t Feature = DE.getUnsigned(Cursor, AltInstFeatureSize);
     const uint8_t OrgSize = DE.getU8(Cursor);
     const uint8_t AltSize = DE.getU8(Cursor);
 
     // Older kernels may have the padlen field.
-    const uint8_t PadLen = opts::AltInstHasPadLen ? DE.getU8(Cursor) : 0;
+    const uint8_t PadLen = AltInstHasPadLen ? DE.getU8(Cursor) : 0;
 
     if (!Cursor)
       return createStringError(
@@ -1358,7 +1417,7 @@ Error LinuxKernelRewriter::readAltInstructions() {
                 << "\n\tFeature: 0x" << Twine::utohexstr(Feature)
                 << "\n\tOrgSize: " << (int)OrgSize
                 << "\n\tAltSize: " << (int)AltSize << '\n';
-      if (opts::AltInstHasPadLen)
+      if (AltInstHasPadLen)
         BC.outs() << "\tPadLen:  " << (int)PadLen << '\n';
     }
 
@@ -1375,7 +1434,7 @@ Error LinuxKernelRewriter::readAltInstructions() {
 
     BinaryFunction *AltBF =
         BC.getBinaryFunctionContainingAddress(AltInstAddress);
-    if (AltBF && BC.shouldEmit(*AltBF)) {
+    if (!ParseOnly && AltBF && BC.shouldEmit(*AltBF)) {
       BC.errs()
           << "BOLT-WARNING: alternative instruction sequence found in function "
           << *AltBF << '\n';
@@ -1397,6 +1456,9 @@ Error LinuxKernelRewriter::readAltInstructions() {
                                " referenced by .altinstructions entry %d",
                                OrgInstAddress, EntryID);
 
+    if (ParseOnly)
+      continue;
+
     // There could be more than one alternative instruction sequences for the
     // same original instruction. Annotate each alternative separately.
     std::string AnnotationName = "AltInst";
diff --git a/bolt/test/X86/linux-alt-instruction.s b/bolt/test/X86/linux-alt-instruction.s
index 2cdf31519682a..66cd33a711b89 100644
--- a/bolt/test/X86/linux-alt-instruction.s
+++ b/bolt/test/X86/linux-alt-instruction.s
@@ -12,24 +12,30 @@
 ## Older kernels used to have padlen field in alt_instr. Check compatibility.
 
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown --defsym PADLEN=1 \
-# RUN:   %s -o %t.o
-# RUN: %clang %cflags -nostdlib %t.o -o %t.exe \
+# RUN:   %s -o %t.padlen.o
+# RUN: %clang %cflags -nostdlib %t.padlen.o -o %t.padlen.exe \
 # RUN:   -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr,--no-pie
-# RUN: llvm-bolt %t.exe --print-normalized --alt-inst-has-padlen -o %t.out \
+# RUN: llvm-bolt %t.padlen.exe --print-normalized --alt-inst-has-padlen -o %t.padlen.out \
 # RUN:   | FileCheck %s
 
 ## Check with a larger size of "feature" field in alt_instr.
 
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
-# RUN:   --defsym FEATURE_SIZE_4=1 %s -o %t.o
-# RUN: %clang %cflags -nostdlib %t.o -o %t.exe \
+# RUN:   --defsym FEATURE_SIZE_4=1 %s -o %t.fs4.o
+# RUN: %clang %cflags -nostdlib %t.fs4.o -o %t.fs4.exe \
 # RUN:   -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr,--no-pie
-# RUN: llvm-bolt %t.exe --print-normalized --alt-inst-feature-size=4 -o %t.out \
+# RUN: llvm-bolt %t.fs4.exe --print-normalized --alt-inst-feature-size=4 -o %t.fs4.out \
 # RUN:   | FileCheck %s
 
 ## Check that out-of-bounds read is handled properly.
 
-# RUN: not llvm-bolt %t.exe --print-normalized --alt-inst-feature-size=2 -o %t.out
+# RUN: not llvm-bolt %t.fs4.exe --alt-inst-feature-size=2 -o %t.fs4.out
+
+## Check that BOLT automatically detects structure fields in .altinstructions.
+
+# RUN: llvm-bolt %t.exe --print-normalized -o %t.out | FileCheck %s
+# RUN: llvm-bolt %t.exe --print-normalized -o %t.padlen.out | FileCheck %s
+# RUN: llvm-bolt %t.exe --print-normalized -o %t.fs4.out | FileCheck %s
 
 # CHECK:      BOLT-INFO: Linux kernel binary detected
 # CHECK:      BOLT-INFO: parsed 2 alternative instruction entries

>From 2a1c835e054ae02ecf1a1ffdcbbed1a4895ccc11 Mon Sep 17 00:00:00 2001
From: Maksim Panchenko <maks at meta.com>
Date: Tue, 11 Jun 2024 13:22:18 -0700
Subject: [PATCH 6/8] [BOLT] Skip optimization of functions with alt
 instructions

Summary:
Alternative instructions in the Linux kernel may modify control flow in
a function. As such, it is unsafe to optimize functions with alternative
instructions until we properly support CFG alternatives.

Previously, we marked functions with alt instructions before the
emission, but that could be too late if we remove or replace
instructions with alternatives. We could have marked functions as
non-simple immediately after reading .altinstructions, but it's nice to
be able to view functions after CFG is built. Thus assign the non-simple
status after building CFG.

https://github.com/llvm/llvm-project/pull/95172

Test Plan: sandcastle

Reviewers: rafaelauler, aaupov, #llvm-bolt

Reviewed By: aaupov

Differential Revision: https://phabricator.intern.facebook.com/D58756459
---
 bolt/lib/Rewrite/LinuxKernelRewriter.cpp | 16 +++++++---------
 bolt/test/X86/linux-alt-instruction.s    | 12 ++++++------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
index 6b3f5bce9f0f5..7849e6b244f63 100644
--- a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
+++ b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
@@ -273,9 +273,9 @@ class LinuxKernelRewriter final : public MetadataRewriter {
 
   /// Handle alternative instruction info from .altinstructions.
   Error readAltInstructions();
+  void processAltInstructionsPostCFG();
   Error tryReadAltInstructions(uint32_t AltInstFeatureSize,
                                bool AltInstHasPadLen, bool ParseOnly);
-  Error rewriteAltInstructions();
 
   /// Read .pci_fixup
   Error readPCIFixupTable();
@@ -326,6 +326,8 @@ class LinuxKernelRewriter final : public MetadataRewriter {
     if (Error E = processORCPostCFG())
       return E;
 
+    processAltInstructionsPostCFG();
+
     return Error::success();
   }
 
@@ -335,9 +337,6 @@ class LinuxKernelRewriter final : public MetadataRewriter {
     if (Error E = rewriteExceptionTable())
       return E;
 
-    if (Error E = rewriteAltInstructions())
-      return E;
-
     if (Error E = rewriteParaInstructions())
       return E;
 
@@ -1485,12 +1484,11 @@ Error LinuxKernelRewriter::tryReadAltInstructions(uint32_t AltInstFeatureSize,
   return Error::success();
 }
 
-Error LinuxKernelRewriter::rewriteAltInstructions() {
-  // Disable output of functions with alt instructions before the rewrite
-  // support is complete.
+void LinuxKernelRewriter::processAltInstructionsPostCFG() {
+  // Disable optimization and output of functions with alt instructions before
+  // the rewrite support is complete. Alt instructions can modify the control
+  // flow, hence we may end up deleting seemingly unreachable code.
   skipFunctionsWithAnnotation("AltInst");
-
-  return Error::success();
 }
 
 /// When the Linux kernel needs to handle an error associated with a given PCI
diff --git a/bolt/test/X86/linux-alt-instruction.s b/bolt/test/X86/linux-alt-instruction.s
index 66cd33a711b89..dc1b12a277785 100644
--- a/bolt/test/X86/linux-alt-instruction.s
+++ b/bolt/test/X86/linux-alt-instruction.s
@@ -6,7 +6,7 @@
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
 # RUN: %clang %cflags -nostdlib %t.o -o %t.exe \
 # RUN:   -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr,--no-pie
-# RUN: llvm-bolt %t.exe --print-normalized --alt-inst-feature-size=2 -o %t.out \
+# RUN: llvm-bolt %t.exe --print-cfg --alt-inst-feature-size=2 -o %t.out \
 # RUN:   | FileCheck %s
 
 ## Older kernels used to have padlen field in alt_instr. Check compatibility.
@@ -15,7 +15,7 @@
 # RUN:   %s -o %t.padlen.o
 # RUN: %clang %cflags -nostdlib %t.padlen.o -o %t.padlen.exe \
 # RUN:   -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr,--no-pie
-# RUN: llvm-bolt %t.padlen.exe --print-normalized --alt-inst-has-padlen -o %t.padlen.out \
+# RUN: llvm-bolt %t.padlen.exe --print-cfg --alt-inst-has-padlen -o %t.padlen.out \
 # RUN:   | FileCheck %s
 
 ## Check with a larger size of "feature" field in alt_instr.
@@ -24,7 +24,7 @@
 # RUN:   --defsym FEATURE_SIZE_4=1 %s -o %t.fs4.o
 # RUN: %clang %cflags -nostdlib %t.fs4.o -o %t.fs4.exe \
 # RUN:   -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr,--no-pie
-# RUN: llvm-bolt %t.fs4.exe --print-normalized --alt-inst-feature-size=4 -o %t.fs4.out \
+# RUN: llvm-bolt %t.fs4.exe --print-cfg --alt-inst-feature-size=4 -o %t.fs4.out \
 # RUN:   | FileCheck %s
 
 ## Check that out-of-bounds read is handled properly.
@@ -33,9 +33,9 @@
 
 ## Check that BOLT automatically detects structure fields in .altinstructions.
 
-# RUN: llvm-bolt %t.exe --print-normalized -o %t.out | FileCheck %s
-# RUN: llvm-bolt %t.exe --print-normalized -o %t.padlen.out | FileCheck %s
-# RUN: llvm-bolt %t.exe --print-normalized -o %t.fs4.out | FileCheck %s
+# RUN: llvm-bolt %t.exe --print-cfg -o %t.out | FileCheck %s
+# RUN: llvm-bolt %t.exe --print-cfg -o %t.padlen.out | FileCheck %s
+# RUN: llvm-bolt %t.exe --print-cfg -o %t.fs4.out | FileCheck %s
 
 # CHECK:      BOLT-INFO: Linux kernel binary detected
 # CHECK:      BOLT-INFO: parsed 2 alternative instruction entries

>From 2e9abce42878371de98c331be36e9b05afac6c36 Mon Sep 17 00:00:00 2001
From: Sayhaan Siddiqui <sayhaan at meta.com>
Date: Mon, 17 Jun 2024 10:16:44 -0700
Subject: [PATCH 7/8] [BOLT][DWARF] Refactor legacy ranges writers

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: https://phabricator.intern.facebook.com/D58679290
---
 bolt/include/bolt/Core/DebugData.h            |  12 ++
 bolt/include/bolt/Rewrite/DWARFRewriter.h     |   4 +
 bolt/lib/Core/DebugData.cpp                   |   6 +
 bolt/lib/Rewrite/DWARFRewriter.cpp            | 132 +++++++++++-------
 bolt/test/X86/debug-fission-single-convert.s  |   6 +-
 bolt/test/X86/dwarf4-df-dualcu.test           |  40 +++---
 .../X86/dwarf4-df-input-lowpc-ranges-cus.test |  80 ++++++-----
 .../X86/dwarf4-df-input-lowpc-ranges.test     |  37 ++---
 8 files changed, 190 insertions(+), 127 deletions(-)

diff --git a/bolt/include/bolt/Core/DebugData.h b/bolt/include/bolt/Core/DebugData.h
index 585bafa088849..144433ac78a37 100644
--- a/bolt/include/bolt/Core/DebugData.h
+++ b/bolt/include/bolt/Core/DebugData.h
@@ -210,6 +210,15 @@ class DebugRangesSectionWriter {
   static bool classof(const DebugRangesSectionWriter *Writer) {
     return Writer->getKind() == RangesWriterKind::DebugRangesWriter;
   }
+  
+  /// Append a range to the main buffer.
+  void appendToRangeBuffer(const DebugBufferVector &CUBuffer);
+
+  /// Sets Unit DIE to be updated for CU.
+  void setDie(DIE *Die) { this->Die = Die; }
+
+  /// Returns Unit DIE to be updated for CU.
+  DIE *getDie() const { return Die; }
 
   /// Writes out range lists for a current CU being processed.
   void virtual finalizeSection(){};
@@ -232,6 +241,9 @@ class DebugRangesSectionWriter {
   static constexpr uint64_t EmptyRangesOffset{0};
 
 private:
+  /// Stores Unit DIE to be updated for CU.
+  DIE *Die{0};
+
   RangesWriterKind Kind;
 };
 
diff --git a/bolt/include/bolt/Rewrite/DWARFRewriter.h b/bolt/include/bolt/Rewrite/DWARFRewriter.h
index 3cc9d823c815b..7ad71579d3f18 100644
--- a/bolt/include/bolt/Rewrite/DWARFRewriter.h
+++ b/bolt/include/bolt/Rewrite/DWARFRewriter.h
@@ -90,6 +90,10 @@ class DWARFRewriter {
   /// Store Rangelists writer for each DWO CU.
   RangeListsDWOWriers RangeListsWritersByCU;
 
+  /// Stores ranges writer for each DWO CU.
+  std::unordered_map<uint64_t, std::unique_ptr<DebugRangesSectionWriter>>
+      LegacyRangesWritersByCU;
+
   std::mutex LocListDebugInfoPatchesMutex;
 
   /// Dwo id specific its RangesBase.
diff --git a/bolt/lib/Core/DebugData.cpp b/bolt/lib/Core/DebugData.cpp
index f502a50312470..08d4c45aac791 100644
--- a/bolt/lib/Core/DebugData.cpp
+++ b/bolt/lib/Core/DebugData.cpp
@@ -177,6 +177,12 @@ uint64_t DebugRangesSectionWriter::getSectionOffset() {
   return SectionOffset;
 }
 
+void DebugRangesSectionWriter::appendToRangeBuffer(
+    const DebugBufferVector &CUBuffer) {
+  *RangesStream << CUBuffer;
+  SectionOffset = RangesBuffer->size();
+}
+
 DebugAddrWriter *DebugRangeListsSectionWriter::AddrWriter = nullptr;
 
 uint64_t DebugRangeListsSectionWriter::addRanges(
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 7b62999dfb2b6..c995df57ec276 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -648,6 +648,15 @@ void DWARFRewriter::updateDebugInfo() {
 
     } else {
       LocListWritersByCU[CUIndex] = std::make_unique<DebugLocWriter>();
+      if (std::optional<uint64_t> DWOId = CU.getDWOId()) {
+        assert(LegacyRangesWritersByCU.count(*DWOId) == 0 &&
+               "LegacyRangeLists writer for DWO unit already exists.");
+        auto LegacyRangesSectionWriterByCU =
+            std::make_unique<DebugRangesSectionWriter>();
+        LegacyRangesSectionWriterByCU->initSection(CU);
+        LegacyRangesWritersByCU[*DWOId] =
+            std::move(LegacyRangesSectionWriterByCU);
+      }
     }
     return LocListWritersByCU[CUIndex++].get();
   };
@@ -695,6 +704,7 @@ void DWARFRewriter::updateDebugInfo() {
       if (Unit->getVersion() >= 5) {
         TempRangesSectionWriter = RangeListsWritersByCU[*DWOId].get();
       } else {
+        TempRangesSectionWriter = LegacyRangesWritersByCU[*DWOId].get();
         RangesBase = RangesSectionWriter->getSectionOffset();
         setDwoRangesBase(*DWOId, *RangesBase);
       }
@@ -1276,9 +1286,17 @@ void DWARFRewriter::updateDWARFObjectAddressRanges(
     }
 
     if (RangesBaseInfo) {
-      DIEBldr.replaceValue(&Die, RangesBaseInfo.getAttribute(),
-                           RangesBaseInfo.getForm(),
-                           DIEInteger(static_cast<uint32_t>(*RangesBase)));
+      if (RangesBaseInfo.getAttribute() == dwarf::DW_AT_GNU_ranges_base) {
+        auto RangesWriterIterator =
+            LegacyRangesWritersByCU.find(*Unit.getDWOId());
+        assert(RangesWriterIterator != LegacyRangesWritersByCU.end() &&
+               "RangesWriter does not exist for DWOId");
+        RangesWriterIterator->second->setDie(&Die);
+      } else {
+        DIEBldr.replaceValue(&Die, RangesBaseInfo.getAttribute(),
+                             RangesBaseInfo.getForm(),
+                             DIEInteger(static_cast<uint32_t>(*RangesBase)));
+      }
       RangesBase = std::nullopt;
     }
   }
@@ -1296,20 +1314,12 @@ void DWARFRewriter::updateDWARFObjectAddressRanges(
         RangesAttrInfo.getForm() == dwarf::DW_FORM_sec_offset)
       NeedConverted = true;
 
-    uint64_t CurRangeBase = 0;
-    if (Unit.isDWOUnit()) {
-      if (std::optional<uint64_t> DWOId = Unit.getDWOId())
-        CurRangeBase = getDwoRangesBase(*DWOId);
-      else
-        errs() << "BOLT-WARNING: [internal-dwarf-error]: DWOId is not found "
-                  "for DWO Unit.";
-    }
     if (NeedConverted || RangesAttrInfo.getForm() == dwarf::DW_FORM_rnglistx)
       DIEBldr.replaceValue(&Die, dwarf::DW_AT_ranges, dwarf::DW_FORM_rnglistx,
                            DIEInteger(DebugRangesOffset));
     else
       DIEBldr.replaceValue(&Die, dwarf::DW_AT_ranges, RangesAttrInfo.getForm(),
-                           DIEInteger(DebugRangesOffset - CurRangeBase));
+                           DIEInteger(DebugRangesOffset));
 
     if (!RangesBase) {
       if (LowPCAttrInfo &&
@@ -1326,15 +1336,21 @@ void DWARFRewriter::updateDWARFObjectAddressRanges(
 
     // If we are at this point we are in the CU/Skeleton CU, and
     // DW_AT_GNU_ranges_base or DW_AT_rnglists_base doesn't exist.
-    if (Unit.getVersion() <= 4)
+    if (Unit.getVersion() <= 4) {
       DIEBldr.addValue(&Die, dwarf::DW_AT_GNU_ranges_base, dwarf::DW_FORM_data4,
-                       DIEInteger(*RangesBase));
-    else if (Unit.getVersion() == 5)
+                       DIEInteger(INT_MAX));
+      auto RangesWriterIterator =
+          LegacyRangesWritersByCU.find(*Unit.getDWOId());
+      assert(RangesWriterIterator != LegacyRangesWritersByCU.end() &&
+             "RangesWriter does not exist for DWOId");
+      RangesWriterIterator->second->setDie(&Die);
+    } else if (Unit.getVersion() == 5) {
       DIEBldr.addValue(&Die, dwarf::DW_AT_rnglists_base,
                        dwarf::DW_FORM_sec_offset, DIEInteger(*RangesBase));
-    else
+    } else {
       DIEBldr.addValue(&Die, dwarf::DW_AT_rnglists_base,
                        dwarf::DW_FORM_sec_offset, DIEInteger(*RangesBase));
+    }
     return;
   }
 
@@ -1612,6 +1628,30 @@ void DWARFRewriter::finalizeCompileUnits(DIEBuilder &DIEBlder,
                                          DIEStreamer &Streamer,
                                          CUOffsetMap &CUMap,
                                          const std::list<DWARFUnit *> &CUs) {
+  for (DWARFUnit *CU : CUs) {
+    if (CU->getVersion() != 4)
+      continue;
+    std::optional<uint64_t> DWOId = CU->getDWOId();
+    if (!DWOId)
+      continue;
+    auto RangesWriterIterator = LegacyRangesWritersByCU.find(*DWOId);
+    assert(RangesWriterIterator != LegacyRangesWritersByCU.end() &&
+           "RangesWriter does not exist for DWOId");
+    std::unique_ptr<DebugRangesSectionWriter> &LegacyRangesWriter =
+        RangesWriterIterator->second;
+    std::optional<DIE *> Die = LegacyRangesWriter->getDie();
+    if (!Die || !Die.value())
+      continue;
+    DIEValue DvalGNUBase =
+        Die.value()->findAttribute(dwarf::DW_AT_GNU_ranges_base);
+    assert(DvalGNUBase && "GNU_ranges_base attribute does not exist for DWOId");
+    DIEBlder.replaceValue(
+        Die.value(), dwarf::DW_AT_GNU_ranges_base, DvalGNUBase.getForm(),
+        DIEInteger(LegacyRangesSectionWriter->getSectionOffset()));
+    std::unique_ptr<DebugBufferVector> RangesWritersContents =
+        LegacyRangesWriter->releaseBuffer();
+    LegacyRangesSectionWriter->appendToRangeBuffer(*RangesWritersContents);
+  }
   DIEBlder.generateAbbrevs();
   DIEBlder.finish();
   // generate debug_info and CUMap
@@ -2270,7 +2310,6 @@ void DWARFRewriter::convertToRangesPatchDebugInfo(
     DWARFUnit &Unit, DIEBuilder &DIEBldr, DIE &Die,
     uint64_t RangesSectionOffset, DIEValue &LowPCAttrInfo,
     DIEValue &HighPCAttrInfo, std::optional<uint64_t> RangesBase) {
-  uint32_t BaseOffset = 0;
   dwarf::Form LowForm = LowPCAttrInfo.getForm();
   dwarf::Attribute RangeBaseAttribute = dwarf::DW_AT_GNU_ranges_base;
   dwarf::Form RangesForm = dwarf::DW_FORM_sec_offset;
@@ -2285,45 +2324,40 @@ void DWARFRewriter::convertToRangesPatchDebugInfo(
                    Die.getTag() == dwarf::DW_TAG_skeleton_unit;
   if (!IsUnitDie)
     DIEBldr.deleteValue(&Die, LowPCAttrInfo.getAttribute());
-  // In DWARF4 for DW_AT_low_pc in binary DW_FORM_addr is used. In the DWO
-  // section DW_FORM_GNU_addr_index is used. So for if we are converting
-  // DW_AT_low_pc/DW_AT_high_pc and see DW_FORM_GNU_addr_index. We are
-  // converting in DWO section, and DW_AT_ranges [DW_FORM_sec_offset] is
-  // relative to DW_AT_GNU_ranges_base.
-  if (LowForm == dwarf::DW_FORM_GNU_addr_index) {
-    // Ranges are relative to DW_AT_GNU_ranges_base.
-    uint64_t CurRangeBase = 0;
-    if (std::optional<uint64_t> DWOId = Unit.getDWOId()) {
-      CurRangeBase = getDwoRangesBase(*DWOId);
-    }
-    BaseOffset = CurRangeBase;
-  } else {
-    // In DWARF 5 we can have DW_AT_low_pc either as DW_FORM_addr, or
-    // DW_FORM_addrx. Former is when DW_AT_rnglists_base is present. Latter is
-    // when it's absent.
-    if (IsUnitDie) {
-      if (LowForm == dwarf::DW_FORM_addrx) {
-        const uint32_t Index = AddrWriter->getIndexFromAddress(0, Unit);
-        DIEBldr.replaceValue(&Die, LowPCAttrInfo.getAttribute(),
-                             LowPCAttrInfo.getForm(), DIEInteger(Index));
-      } else {
-        DIEBldr.replaceValue(&Die, LowPCAttrInfo.getAttribute(),
-                             LowPCAttrInfo.getForm(), DIEInteger(0));
-      }
+
+  // In DWARF 5 we can have DW_AT_low_pc either as DW_FORM_addr, or
+  // DW_FORM_addrx. Former is when DW_AT_rnglists_base is present. Latter is
+  // when it's absent.
+  if (IsUnitDie) {
+    if (LowForm == dwarf::DW_FORM_addrx) {
+      const uint32_t Index = AddrWriter->getIndexFromAddress(0, Unit);
+      DIEBldr.replaceValue(&Die, LowPCAttrInfo.getAttribute(),
+                           LowPCAttrInfo.getForm(), DIEInteger(Index));
+    } else {
+      DIEBldr.replaceValue(&Die, LowPCAttrInfo.getAttribute(),
+                           LowPCAttrInfo.getForm(), DIEInteger(0));
     }
-    // Original CU didn't have DW_AT_*_base. We converted it's children (or
-    // dwo), so need to insert it into CU.
-    if (RangesBase)
+  }
+  // Original CU didn't have DW_AT_*_base. We converted it's children (or
+  // dwo), so need to insert it into CU.
+  if (RangesBase) {
+    if (Unit.getVersion() >= 5) {
       DIEBldr.addValue(&Die, RangeBaseAttribute, dwarf::DW_FORM_sec_offset,
                        DIEInteger(*RangesBase));
+    } else {
+      DIEBldr.addValue(&Die, RangeBaseAttribute, dwarf::DW_FORM_sec_offset,
+                       DIEInteger(INT_MAX));
+      auto RangesWriterIterator =
+          LegacyRangesWritersByCU.find(*Unit.getDWOId());
+      assert(RangesWriterIterator != LegacyRangesWritersByCU.end() &&
+             "RangesWriter does not exist for DWOId");
+      RangesWriterIterator->second->setDie(&Die);
+    }
   }
 
-  uint64_t RangeAttrVal = RangesSectionOffset - BaseOffset;
-  if (Unit.getVersion() >= 5)
-    RangeAttrVal = RangesSectionOffset;
   // HighPC was conveted into DW_AT_ranges.
   // For DWARF5 we only access ranges through index.
 
   DIEBldr.replaceValue(&Die, HighPCAttrInfo.getAttribute(), dwarf::DW_AT_ranges,
-                       RangesForm, DIEInteger(RangeAttrVal));
+                       RangesForm, DIEInteger(RangesSectionOffset));
 }
diff --git a/bolt/test/X86/debug-fission-single-convert.s b/bolt/test/X86/debug-fission-single-convert.s
index 28fcb6686e0a2..4cd881740b2f8 100644
--- a/bolt/test/X86/debug-fission-single-convert.s
+++ b/bolt/test/X86/debug-fission-single-convert.s
@@ -31,11 +31,11 @@
 # CHECK-DWO-DWO: 00000010
 # CHECK-DWO-DWO: 00000050
 # CHECK-DWO-DWO: DW_TAG_subprogram
-# CHECK-DWO-DWO-NEXT: DW_AT_ranges [DW_FORM_sec_offset] (0x00000000
+# CHECK-DWO-DWO-NEXT: DW_AT_ranges [DW_FORM_sec_offset] (0x00000010
 # CHECK-DWO-DWO: DW_TAG_subprogram
-# CHECK-DWO-DWO-NEXT: DW_AT_ranges [DW_FORM_sec_offset] (0x00000020
+# CHECK-DWO-DWO-NEXT: DW_AT_ranges [DW_FORM_sec_offset] (0x00000030
 # CHECK-DWO-DWO: DW_TAG_subprogram
-# CHECK-DWO-DWO-NEXT: DW_AT_ranges [DW_FORM_sec_offset] (0x00000040
+# CHECK-DWO-DWO-NEXT: DW_AT_ranges [DW_FORM_sec_offset] (0x00000050
 
 # CHECK-ADDR-SEC: .debug_addr contents:
 # CHECK-ADDR-SEC: 0x00000000: Addrs: [
diff --git a/bolt/test/X86/dwarf4-df-dualcu.test b/bolt/test/X86/dwarf4-df-dualcu.test
index b690623b70d83..fb328eb1872e0 100644
--- a/bolt/test/X86/dwarf4-df-dualcu.test
+++ b/bolt/test/X86/dwarf4-df-dualcu.test
@@ -37,36 +37,38 @@
 
 ; BOLT: .debug_ranges
 ; BOLT-NEXT: 00000000 <End of list>
-; BOLT-NEXT: 00000010 [[#%.16x,ADDR:]] [[#%.16x,ADDRB:]]
+; BOLT-NEXT: 00000010 [[#%.16x,ADDR1:]] [[#%.16x,ADDRB1:]]
+; BOLT-NEXT: 00000010 [[#%.16x,ADDR2:]] [[#%.16x,ADDRB2:]]
 ; BOLT-NEXT: 00000010 <End of list>
-; BOLT-NEXT: 00000030 [[#%.16x,ADDR1:]] [[#%.16x,ADDR1B:]]
-; BOLT-NEXT: 00000030 <End of list>
-; BOLT-NEXT: 00000050 [[#%.16x,ADDR2:]] [[#%.16x,ADDR2B:]]
-; BOLT-NEXT: 00000050 [[#%.16x,ADDR3:]] [[#%.16x,ADDR3B:]]
+; BOLT-NEXT: 00000040 <End of list>
+; BOLT-NEXT: 00000050 [[#%.16x,ADDR1:]] [[#%.16x,ADDRB1:]]
 ; BOLT-NEXT: 00000050 <End of list>
-; BOLT-NEXT: 00000080 [[#%.16x,ADDR4:]] [[#%.16x,ADDR4B:]]
-; BOLT-NEXT: 00000080 <End of list>
-; BOLT-NEXT: 000000a0 [[#%.16x,ADDR5:]] [[#%.16x,ADDR5B:]]
-; BOLT-NEXT: 000000a0 <End of list>
+; BOLT-NEXT: 00000070 [[#%.16x,ADDR2:]] [[#%.16x,ADDRB2:]]
+; BOLT-NEXT: 00000070 <End of list>
+; BOLT-NEXT: 00000090 [[#%.16x,ADDR3:]] [[#%.16x,ADDRB3:]]
+; BOLT-NEXT: 00000090 <End of list>
+; BOLT-NEXT: 000000b0 <End of list>
+; BOLT-NEXT: 000000c0 [[#%.16x,ADDR3:]] [[#%.16x,ADDRB3:]]
+; BOLT-NEXT: 000000c0 <End of list>
 
 ; BOLT: DW_TAG_compile_unit
 ; BOLT: DW_AT_GNU_dwo_name [DW_FORM_strp] ( .debug_str[0x00000016] = "main.dwo.dwo")
 ; BOLT-NEXT: DW_AT_GNU_dwo_id
 ; BOLT-NEXT: DW_AT_low_pc [DW_FORM_addr] (0x0000000000000000)
-; BOLT-NEXT: DW_AT_ranges [DW_FORM_sec_offset] (0x00000050
-; BOLT-NEXT: [0x[[#ADDR2]], 0x[[#ADDR2B]])
-; BOLT-NEXT: [0x[[#ADDR3]], 0x[[#ADDR3B]]))
+; BOLT-NEXT: DW_AT_ranges [DW_FORM_sec_offset] (0x00000010
+; BOLT-NEXT: [0x[[#ADDR1]], 0x[[#ADDRB1]])
+; BOLT-NEXT: [0x[[#ADDR2]], 0x[[#ADDRB2]]))
 ; BOLT-NEXT: DW_AT_GNU_addr_base [DW_FORM_sec_offset]  (0x00000000)
-; BOLT-NEXT: DW_AT_GNU_ranges_base [DW_FORM_sec_offset]  (0x00000010)
+; BOLT-NEXT: DW_AT_GNU_ranges_base [DW_FORM_sec_offset]  (0x00000040)
 ; BOLT-NEXT: Compile
 ; BOLT: DW_TAG_compile_unit
 ; BOLT: DW_AT_GNU_dwo_name [DW_FORM_strp] ( .debug_str[0x00000023] = "helper.dwo.dwo")
 ; BOLT-NEXT: DW_AT_GNU_dwo_id
 ; BOLT-NEXT: DW_AT_low_pc [DW_FORM_addr] (0x0000000000000000)
-; BOLT-NEXT: DW_AT_ranges [DW_FORM_sec_offset] (0x000000a0
-; BOLT-NEXT: [0x[[#ADDR5]], 0x[[#ADDR5B]])
+; BOLT-NEXT: DW_AT_ranges [DW_FORM_sec_offset] (0x00000090
+; BOLT-NEXT: [0x[[#ADDR3]], 0x[[#ADDRB3]])
 ; BOLT-NEXT: DW_AT_GNU_addr_base [DW_FORM_sec_offset]  (0x00000010)
-; BOLT-NEXT: DW_AT_GNU_ranges_base [DW_FORM_sec_offset]  (0x00000080)
+; BOLT-NEXT: DW_AT_GNU_ranges_base [DW_FORM_sec_offset]  (0x000000b0)
 
 ; PRE-BOLT-DWO-MAIN: version = 0x0004
 ; PRE-BOLT-DWO-MAIN: DW_TAG_compile_unit
@@ -113,13 +115,13 @@
 ; BOLT-DWO-MAIN-NEXT: DW_AT_decl_line
 ; BOLT-DWO-MAIN-NEXT: DW_AT_location [DW_FORM_exprloc]	(DW_OP_GNU_addr_index 0x1)
 ; BOLT-DWO-MAIN: DW_TAG_subprogram [4]
-; BOLT-DWO-MAIN-NEXT: DW_AT_ranges [DW_FORM_sec_offset]	(0x00000000
+; BOLT-DWO-MAIN-NEXT: DW_AT_ranges [DW_FORM_sec_offset]	(0x00000010
 ; BOLT-DWO-MAIN-NEXT: )
 ; BOLT-DWO-MAIN-NEXT: DW_AT_frame_base
 ; BOLT-DWO-MAIN-NEXT: DW_AT_linkage_name [DW_FORM_GNU_str_index]	(indexed (00000003) string = "_Z3usePiS_")
 ; BOLT-DWO-MAIN-NEXT: DW_AT_name [DW_FORM_GNU_str_index]	(indexed (00000004) string = "use")
 ; BOLT-DWO-MAIN: DW_TAG_subprogram [6]
-; BOLT-DWO-MAIN-NEXT: DW_AT_ranges [DW_FORM_sec_offset]	(0x00000020
+; BOLT-DWO-MAIN-NEXT: DW_AT_ranges [DW_FORM_sec_offset]	(0x00000030
 ; BOLT-DWO-MAIN-NEXT: )
 ; BOLT-DWO-MAIN-NEXT: DW_AT_frame_base [DW_FORM_exprloc]	(DW_OP_reg6 RBP)
 ; BOLT-DWO-MAIN-NEXT: DW_AT_name [DW_FORM_GNU_str_index]	(indexed (00000005) string = "main")
@@ -160,4 +162,4 @@
 ; BOLT-DWO-HELPER-NEXT: DW_AT_decl_line
 ; BOLT-DWO-HELPER-NEXT: DW_AT_location [DW_FORM_exprloc]	(DW_OP_GNU_addr_index 0x1)
 ; BOLT-DWO-HELPER: DW_TAG_subprogram [4]
-; BOLT-DWO-HELPER-NEXT: DW_AT_ranges [DW_FORM_sec_offset]	(0x00000000
+; BOLT-DWO-HELPER-NEXT: DW_AT_ranges [DW_FORM_sec_offset]	(0x00000010
diff --git a/bolt/test/X86/dwarf4-df-input-lowpc-ranges-cus.test b/bolt/test/X86/dwarf4-df-input-lowpc-ranges-cus.test
index c9abd02bbb7d9..cf9357d5f3c59 100644
--- a/bolt/test/X86/dwarf4-df-input-lowpc-ranges-cus.test
+++ b/bolt/test/X86/dwarf4-df-input-lowpc-ranges-cus.test
@@ -17,45 +17,47 @@
 
 ; BOLT: .debug_ranges
 ; BOLT-NEXT: 00000000 <End of list>
-; BOLT-NEXT: 00000010
-; BOLT-NEXT: 00000010
-; BOLT-NEXT: 00000010
+; BOLT-NEXT: 00000010 [[#%.16x,ADDR1:]] [[#%.16x,ADDRB1:]]
+; BOLT-NEXT: 00000010 [[#%.16x,ADDR2:]] [[#%.16x,ADDRB2:]]
+; BOLT-NEXT: 00000010 [[#%.16x,ADDR3:]] [[#%.16x,ADDRB3:]]
+; BOLT-NEXT: 00000010 [[#%.16x,ADDR4:]] [[#%.16x,ADDRB4:]]
+; BOLT-NEXT: 00000010 [[#%.16x,ADDR5:]] [[#%.16x,ADDRB5:]]
+; BOLT-NEXT: 00000010 [[#%.16x,ADDR6:]] [[#%.16x,ADDRB6:]]
+; BOLT-NEXT: 00000010 [[#%.16x,ADDR7:]] [[#%.16x,ADDRB7:]]
 ; BOLT-NEXT: 00000010 <End of list>
-; BOLT-NEXT: 00000050
-; BOLT-NEXT: 00000050
-; BOLT-NEXT: 00000050
-; BOLT-NEXT: 00000050 <End of list>
-; BOLT-NEXT: 00000090 [[#%.16x,ADDR1:]] [[#%.16x,ADDRB1:]]
-; BOLT-NEXT: 00000090 [[#%.16x,ADDR2:]] [[#%.16x,ADDRB2:]]
-; BOLT-NEXT: 00000090 [[#%.16x,ADDR3:]] [[#%.16x,ADDRB3:]]
-; BOLT-NEXT: 00000090 [[#%.16x,ADDR4:]] [[#%.16x,ADDRB4:]]
-; BOLT-NEXT: 00000090 [[#%.16x,ADDR5:]] [[#%.16x,ADDRB5:]]
-; BOLT-NEXT: 00000090 [[#%.16x,ADDR6:]] [[#%.16x,ADDRB6:]]
-; BOLT-NEXT: 00000090 [[#%.16x,ADDR7:]] [[#%.16x,ADDRB7:]]
 ; BOLT-NEXT: 00000090 <End of list>
-; BOLT-NEXT: 00000110
-; BOLT-NEXT: 00000110
-; BOLT-NEXT: 00000110
-; BOLT-NEXT: 00000110 <End of list>
-; BOLT-NEXT: 00000150
-; BOLT-NEXT: 00000150
-; BOLT-NEXT: 00000150
-; BOLT-NEXT: 00000150 <End of list>
-; BOLT-NEXT: 00000190 [[#%.16x,ADDR8:]] [[#%.16x,ADDRB8:]]
-; BOLT-NEXT: 00000190 [[#%.16x,ADDR9:]] [[#%.16x,ADDRB9:]]
-; BOLT-NEXT: 00000190 [[#%.16x,ADDR10:]] [[#%.16x,ADDRB10:]]
-; BOLT-NEXT: 00000190 [[#%.16x,ADDR11:]] [[#%.16x,ADDRB11:]]
-; BOLT-NEXT: 00000190 [[#%.16x,ADDR12:]] [[#%.16x,ADDRB12:]]
-; BOLT-NEXT: 00000190 [[#%.16x,ADDR13:]] [[#%.16x,ADDRB13:]]
-; BOLT-NEXT: 00000190 [[#%.16x,ADDR14:]] [[#%.16x,ADDRB14:]]
-; BOLT-NEXT: 00000190 <End of list>
+; BOLT-NEXT: 000000a0 [[#%.16x,ADDR1:]] [[#%.16x,ADDRB1:]]
+; BOLT-NEXT: 000000a0 [[#%.16x,ADDR2:]] [[#%.16x,ADDRB2:]]
+; BOLT-NEXT: 000000a0 [[#%.16x,ADDR3:]] [[#%.16x,ADDRB3:]]
+; BOLT-NEXT: 000000a0 <End of list>
+; BOLT-NEXT: 000000e0 [[#%.16x,ADDR5:]] [[#%.16x,ADDRB5:]]
+; BOLT-NEXT: 000000e0 [[#%.16x,ADDR6:]] [[#%.16x,ADDRB6:]]
+; BOLT-NEXT: 000000e0 [[#%.16x,ADDR7:]] [[#%.16x,ADDRB7:]]
+; BOLT-NEXT: 000000e0 <End of list>
+; BOLT-NEXT: 00000120 [[#%.16x,ADDR8:]] [[#%.16x,ADDRB8:]]
+; BOLT-NEXT: 00000120 [[#%.16x,ADDR9:]] [[#%.16x,ADDRB9:]]
+; BOLT-NEXT: 00000120 [[#%.16x,ADDR10:]] [[#%.16x,ADDRB10:]]
+; BOLT-NEXT: 00000120 [[#%.16x,ADDR11:]] [[#%.16x,ADDRB11:]]
+; BOLT-NEXT: 00000120 [[#%.16x,ADDR12:]] [[#%.16x,ADDRB12:]]
+; BOLT-NEXT: 00000120 [[#%.16x,ADDR13:]] [[#%.16x,ADDRB13:]]
+; BOLT-NEXT: 00000120 [[#%.16x,ADDR14:]] [[#%.16x,ADDRB14:]]
+; BOLT-NEXT: 00000120 <End of list>
+; BOLT-NEXT: 000001a0 <End of list>
+; BOLT-NEXT: 000001b0 [[#%.16x,ADDR8:]] [[#%.16x,ADDRB8:]]
+; BOLT-NEXT: 000001b0 [[#%.16x,ADDR9:]] [[#%.16x,ADDRB9:]]
+; BOLT-NEXT: 000001b0 [[#%.16x,ADDR10:]] [[#%.16x,ADDRB10:]]
+; BOLT-NEXT: 000001b0 <End of list>
+; BOLT-NEXT: 000001f0 [[#%.16x,ADDR12:]] [[#%.16x,ADDRB12:]]
+; BOLT-NEXT: 000001f0 [[#%.16x,ADDR13:]] [[#%.16x,ADDRB13:]]
+; BOLT-NEXT: 000001f0 [[#%.16x,ADDR14:]] [[#%.16x,ADDRB14:]]
+; BOLT-NEXT: 000001f0 <End of list>
 
 ; BOLT: DW_TAG_compile_unit
 ; BOLT: DW_AT_GNU_dwo_name [DW_FORM_strp] ( .debug_str[0x{{[0-9a-fA-F]+}}] = "main.dwo.dwo")
 ; BOLT-NEXT: DW_AT_GNU_dwo_id
-; BOLT-NEXT: DW_AT_GNU_ranges_base [DW_FORM_sec_offset]  (0x00000010)
+; BOLT-NEXT: DW_AT_GNU_ranges_base [DW_FORM_sec_offset]  (0x00000090)
 ; BOLT-NEXT: DW_AT_low_pc [DW_FORM_addr] (0x0000000000000000)
-; BOLT-NEXT: DW_AT_ranges [DW_FORM_sec_offset] (0x00000090
+; BOLT-NEXT: DW_AT_ranges [DW_FORM_sec_offset] (0x00000010
 ; BOLT-NEXT: [0x[[#ADDR1]], 0x[[#ADDRB1]])
 ; BOLT-NEXT: [0x[[#ADDR2]], 0x[[#ADDRB2]])
 ; BOLT-NEXT: [0x[[#ADDR3]], 0x[[#ADDRB3]])
@@ -64,13 +66,14 @@
 ; BOLT-NEXT: [0x[[#ADDR6]], 0x[[#ADDRB6]])
 ; BOLT-NEXT: [0x[[#ADDR7]], 0x[[#ADDRB7]])
 ; BOLT-NEXT: DW_AT_GNU_addr_base [DW_FORM_sec_offset]  (0x00000000)
+; BOLT-NEXT: Compile Unit
 
 ; BOLT: DW_TAG_compile_unit
 ; BOLT: DW_AT_GNU_dwo_name [DW_FORM_strp] ( .debug_str[0x{{[0-9a-fA-F]+}}] = "mainOther.dwo.dwo")
 ; BOLT-NEXT: DW_AT_GNU_dwo_id
-; BOLT-NEXT: DW_AT_GNU_ranges_base [DW_FORM_sec_offset]  (0x00000110)
+; BOLT-NEXT: DW_AT_GNU_ranges_base [DW_FORM_sec_offset]  (0x000001a0)
 ; BOLT-NEXT: DW_AT_low_pc [DW_FORM_addr] (0x0000000000000000)
-; BOLT-NEXT: DW_AT_ranges [DW_FORM_sec_offset] (0x00000190
+; BOLT-NEXT: DW_AT_ranges [DW_FORM_sec_offset] (0x00000120
 ; BOLT-NEXT: [0x[[#ADDR8]], 0x[[#ADDRB8]])
 ; BOLT-NEXT: [0x[[#ADDR9]], 0x[[#ADDRB9]])
 ; BOLT-NEXT: [0x[[#ADDR10]], 0x[[#ADDRB10]])
@@ -79,19 +82,20 @@
 ; BOLT-NEXT: [0x[[#ADDR13]], 0x[[#ADDRB13]])
 ; BOLT-NEXT: [0x[[#ADDR14]], 0x[[#ADDRB14]])
 ; BOLT-NEXT: DW_AT_GNU_addr_base [DW_FORM_sec_offset]  (0x00000018)
+; BOLT: {{^$}}
 
 ; BOLT-DWO-MAIN:        DW_TAG_subprogram
-; BOLT-DWO-MAIN-NEXT:   DW_AT_ranges [DW_FORM_sec_offset] (0x00000000
+; BOLT-DWO-MAIN-NEXT:   DW_AT_ranges [DW_FORM_sec_offset] (0x00000010
 ; BOLT-DWO-MAIN:        DW_TAG_subprogram
 ; BOLT-DWO-MAIN:        DW_TAG_subprogram
 ; BOLT-DWO-MAIN:        DW_TAG_subprogram
 ; BOLT-DWO-MAIN:        DW_TAG_subprogram
-; BOLT-DWO-MAIN-NEXT:   DW_AT_ranges [DW_FORM_sec_offset] (0x00000040
+; BOLT-DWO-MAIN-NEXT:   DW_AT_ranges [DW_FORM_sec_offset] (0x00000050
 
 ; BOLT-DWO-MAIN:        DW_TAG_subprogram
-; BOLT-DWO-MAIN-NEXT:   DW_AT_ranges [DW_FORM_sec_offset] (0x00000000
+; BOLT-DWO-MAIN-NEXT:   DW_AT_ranges [DW_FORM_sec_offset] (0x00000010
 ; BOLT-DWO-MAIN:        DW_TAG_subprogram
 ; BOLT-DWO-MAIN:        DW_TAG_subprogram
 ; BOLT-DWO-MAIN:        DW_TAG_subprogram
 ; BOLT-DWO-MAIN:        DW_TAG_subprogram
-; BOLT-DWO-MAIN-NEXT:   DW_AT_ranges [DW_FORM_sec_offset] (0x00000040
+; BOLT-DWO-MAIN-NEXT:   DW_AT_ranges [DW_FORM_sec_offset] (0x00000050
diff --git a/bolt/test/X86/dwarf4-df-input-lowpc-ranges.test b/bolt/test/X86/dwarf4-df-input-lowpc-ranges.test
index 276bea4ba0c1c..ab4353a282475 100644
--- a/bolt/test/X86/dwarf4-df-input-lowpc-ranges.test
+++ b/bolt/test/X86/dwarf4-df-input-lowpc-ranges.test
@@ -15,29 +15,30 @@
 
 ; BOLT: .debug_ranges
 ; BOLT-NEXT: 00000000 <End of list>
-; BOLT-NEXT: 00000010
-; BOLT-NEXT: 00000010
-; BOLT-NEXT: 00000010
+; BOLT-NEXT: 00000010 [[#%.16x,ADDR1:]] [[#%.16x,ADDRB1:]]
+; BOLT-NEXT: 00000010 [[#%.16x,ADDR2:]] [[#%.16x,ADDRB2:]]
+; BOLT-NEXT: 00000010 [[#%.16x,ADDR3:]] [[#%.16x,ADDRB3:]]
+; BOLT-NEXT: 00000010 [[#%.16x,ADDR4:]] [[#%.16x,ADDRB4:]]
+; BOLT-NEXT: 00000010 [[#%.16x,ADDR5:]] [[#%.16x,ADDRB5:]]
+; BOLT-NEXT: 00000010 [[#%.16x,ADDR6:]] [[#%.16x,ADDRB6:]]
+; BOLT-NEXT: 00000010 [[#%.16x,ADDR7:]] [[#%.16x,ADDRB7:]]
 ; BOLT-NEXT: 00000010 <End of list>
-; BOLT-NEXT: 00000050
-; BOLT-NEXT: 00000050
-; BOLT-NEXT: 00000050
-; BOLT-NEXT: 00000050 <End of list>
-; BOLT-NEXT: 00000090 [[#%.16x,ADDR1:]] [[#%.16x,ADDRB1:]]
-; BOLT-NEXT: 00000090 [[#%.16x,ADDR2:]] [[#%.16x,ADDRB2:]]
-; BOLT-NEXT: 00000090 [[#%.16x,ADDR3:]] [[#%.16x,ADDRB3:]]
-; BOLT-NEXT: 00000090 [[#%.16x,ADDR4:]] [[#%.16x,ADDRB4:]]
-; BOLT-NEXT: 00000090 [[#%.16x,ADDR5:]] [[#%.16x,ADDRB5:]]
-; BOLT-NEXT: 00000090 [[#%.16x,ADDR6:]] [[#%.16x,ADDRB6:]]
-; BOLT-NEXT: 00000090 [[#%.16x,ADDR7:]] [[#%.16x,ADDRB7:]]
 ; BOLT-NEXT: 00000090 <End of list>
+; BOLT-NEXT: 000000a0 [[#%.16x,ADDR1:]] [[#%.16x,ADDRB1:]]
+; BOLT-NEXT: 000000a0 [[#%.16x,ADDR2:]] [[#%.16x,ADDRB2:]]
+; BOLT-NEXT: 000000a0 [[#%.16x,ADDR3:]] [[#%.16x,ADDRB3:]]
+; BOLT-NEXT: 000000a0 <End of list>
+; BOLT-NEXT: 000000e0 [[#%.16x,ADDR5:]] [[#%.16x,ADDRB5:]]
+; BOLT-NEXT: 000000e0 [[#%.16x,ADDR6:]] [[#%.16x,ADDRB6:]]
+; BOLT-NEXT: 000000e0 [[#%.16x,ADDR7:]] [[#%.16x,ADDRB7:]]
+; BOLT-NEXT: 000000e0 <End of list>
 
 ; BOLT: DW_TAG_compile_unit
 ; BOLT: DW_AT_GNU_dwo_name [DW_FORM_strp] ( .debug_str[0x{{[0-9a-fA-F]+}}] = "main.dwo.dwo")
 ; BOLT-NEXT: DW_AT_GNU_dwo_id
-; BOLT-NEXT: DW_AT_GNU_ranges_base [DW_FORM_sec_offset]  (0x00000010)
+; BOLT-NEXT: DW_AT_GNU_ranges_base [DW_FORM_sec_offset]  (0x00000090)
 ; BOLT-NEXT: DW_AT_low_pc [DW_FORM_addr] (0x0000000000000000)
-; BOLT-NEXT: DW_AT_ranges [DW_FORM_sec_offset] (0x00000090
+; BOLT-NEXT: DW_AT_ranges [DW_FORM_sec_offset] (0x00000010
 ; BOLT-NEXT: [0x[[#ADDR1]], 0x[[#ADDRB1]])
 ; BOLT-NEXT: [0x[[#ADDR2]], 0x[[#ADDRB2]])
 ; BOLT-NEXT: [0x[[#ADDR3]], 0x[[#ADDRB3]])
@@ -48,9 +49,9 @@
 ; BOLT-NEXT: DW_AT_GNU_addr_base [DW_FORM_sec_offset]  (0x00000000)
 
 ; BOLT-DWO-MAIN:        DW_TAG_subprogram
-; BOLT-DWO-MAIN-NEXT:   DW_AT_ranges [DW_FORM_sec_offset] (0x00000000
+; BOLT-DWO-MAIN-NEXT:   DW_AT_ranges [DW_FORM_sec_offset] (0x00000010
 ; BOLT-DWO-MAIN:        DW_TAG_subprogram
 ; BOLT-DWO-MAIN:        DW_TAG_subprogram
 ; BOLT-DWO-MAIN:        DW_TAG_subprogram
 ; BOLT-DWO-MAIN:        DW_TAG_subprogram
-; BOLT-DWO-MAIN-NEXT:   DW_AT_ranges [DW_FORM_sec_offset] (0x00000040
+; BOLT-DWO-MAIN-NEXT:   DW_AT_ranges [DW_FORM_sec_offset] (0x00000050

>From efe2615bc90dbc08ec12512fae0fb60f9f7ab897 Mon Sep 17 00:00:00 2001
From: Sayhaan Siddiqui <sayhaan at meta.com>
Date: Mon, 8 Jul 2024 10:26:50 -0700
Subject: [PATCH 8/8] Refactor address writers

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: https://phabricator.intern.facebook.com/D59493903
---
 bolt/include/bolt/Core/DebugData.h        | 53 ++++++++++++-----
 bolt/include/bolt/Rewrite/DWARFRewriter.h |  5 ++
 bolt/lib/Core/DebugData.cpp               | 71 +++++++++++------------
 bolt/lib/Rewrite/DWARFRewriter.cpp        | 63 ++++++++++++--------
 4 files changed, 118 insertions(+), 74 deletions(-)

diff --git a/bolt/include/bolt/Core/DebugData.h b/bolt/include/bolt/Core/DebugData.h
index 144433ac78a37..05217c699f0dd 100644
--- a/bolt/include/bolt/Core/DebugData.h
+++ b/bolt/include/bolt/Core/DebugData.h
@@ -344,14 +344,34 @@ class DebugAddrWriter {
   uint32_t getIndexFromAddress(uint64_t Address, DWARFUnit &CU);
 
   /// Write out entries in to .debug_addr section for CUs.
-  virtual void update(DIEBuilder &DIEBlder, DWARFUnit &CUs);
+  virtual void update();
 
   /// Return buffer with all the entries in .debug_addr already writen out using
   /// update(...).
   virtual AddressSectionBuffer &finalize() { return *Buffer; }
 
   /// Returns False if .debug_addr section was created..
-  bool isInitialized() const { return !AddressMaps.empty(); }
+  bool isInitialized() { return Map.empty; }
+
+  /// Updates address base with the given Offset.
+  virtual void updateAddrBase(DIEBuilder &DIEBlder, DWARFUnit &CU,
+                              const uint64_t Offset);
+
+  /// Appends an AddressSectionBuffer to the address writer buffer for the given
+  /// CU.
+  void appendToAddressBuffer(const AddressSectionBuffer &Buffer) {
+    *AddressStream << Buffer;
+  }
+
+  /// Sets AddressByteSize for the CU.
+  void setAddressByteSize(uint8_t AddressByteSize) {
+    this->AddressByteSize = AddressByteSize;
+  }
+
+  /// Sets AddrOffsetSectionBase for the CU.
+  void setAddrOffsetSectionBase(std::optional<uint64_t> AddrOffsetSectionBase) {
+    this->AddrOffsetSectionBase = AddrOffsetSectionBase;
+  }
 
 protected:
   class AddressForDWOCU {
@@ -396,6 +416,8 @@ class DebugAddrWriter {
 
     void dump();
 
+    bool empty = false;
+
   private:
     AddressToIndexMap AddressToIndex;
     IndexToAddressMap IndexToAddress;
@@ -408,14 +430,16 @@ class DebugAddrWriter {
   }
 
   BinaryContext *BC;
-  /// Maps DWOID to AddressForDWOCU.
-  std::unordered_map<uint64_t, AddressForDWOCU> AddressMaps;
+  /// Address for the DWO CU associated with the address writer.
+  AddressForDWOCU Map;
+  uint8_t AddressByteSize;
+  std::optional<uint64_t> AddrOffsetSectionBase;
   /// Mutex used for parallel processing of debug info.
   std::mutex WriterMutex;
   std::unique_ptr<AddressSectionBuffer> Buffer;
   std::unique_ptr<raw_svector_ostream> AddressStream;
   /// Used to track sections that were not modified so that they can be re-used.
-  DenseMap<uint64_t, uint64_t> UnmodifiedAddressOffsets;
+  static DenseMap<uint64_t, uint64_t> UnmodifiedAddressOffsets;
 };
 
 class DebugAddrWriterDwarf5 : public DebugAddrWriter {
@@ -424,7 +448,10 @@ class DebugAddrWriterDwarf5 : public DebugAddrWriter {
   DebugAddrWriterDwarf5(BinaryContext *BC) : DebugAddrWriter(BC) {}
 
   /// Write out entries in to .debug_addr section for CUs.
-  virtual void update(DIEBuilder &DIEBlder, DWARFUnit &CUs) override;
+  virtual void update() override;
+
+  virtual void updateAddrBase(DIEBuilder &DIEBlder, DWARFUnit &CU,
+                              const uint64_t Offset) override;
 
 protected:
   /// Given DWARFUnit \p Unit returns either DWO ID or it's offset within
@@ -584,12 +611,10 @@ class DebugLoclistWriter : public DebugLocWriter {
 public:
   ~DebugLoclistWriter() {}
   DebugLoclistWriter() = delete;
-  DebugLoclistWriter(DWARFUnit &Unit, uint8_t DV, bool SD)
-      : DebugLocWriter(DV, LocWriterKind::DebugLoclistWriter), CU(Unit),
-        IsSplitDwarf(SD) {
-    assert(DebugLoclistWriter::AddrWriter &&
-           "Please use SetAddressWriter to initialize "
-           "DebugAddrWriter before instantiation.");
+  DebugLoclistWriter(DWARFUnit &Unit, uint8_t DV, bool SD,
+                     DebugAddrWriter *AddrW)
+      : DebugLocWriter(DV, LocWriterKind::DebugLoclistWriter),
+        AddrWriter(AddrW), CU(Unit), IsSplitDwarf(SD) {
     if (DwarfVersion >= 5) {
       LocBodyBuffer = std::make_unique<DebugBufferVector>();
       LocBodyStream = std::make_unique<raw_svector_ostream>(*LocBodyBuffer);
@@ -601,7 +626,7 @@ class DebugLoclistWriter : public DebugLocWriter {
     }
   }
 
-  static void setAddressWriter(DebugAddrWriter *AddrW) { AddrWriter = AddrW; }
+  void setAddressWriter(DebugAddrWriter *AddrW) { AddrWriter = AddrW; }
 
   /// Stores location lists internally to be written out during finalize phase.
   virtual void addList(DIEBuilder &DIEBldr, DIE &Die, DIEValue &AttrInfo,
@@ -631,7 +656,7 @@ class DebugLoclistWriter : public DebugLocWriter {
   /// Writes out locations in to a local buffer and applies debug info patches.
   void finalizeDWARF5(DIEBuilder &DIEBldr, DIE &Die);
 
-  static DebugAddrWriter *AddrWriter;
+  DebugAddrWriter *AddrWriter;
   DWARFUnit &CU;
   bool IsSplitDwarf{false};
   // Used for DWARF5 to store location lists before being finalized.
diff --git a/bolt/include/bolt/Rewrite/DWARFRewriter.h b/bolt/include/bolt/Rewrite/DWARFRewriter.h
index 7ad71579d3f18..de0d66ca8ffda 100644
--- a/bolt/include/bolt/Rewrite/DWARFRewriter.h
+++ b/bolt/include/bolt/Rewrite/DWARFRewriter.h
@@ -94,6 +94,10 @@ class DWARFRewriter {
   std::unordered_map<uint64_t, std::unique_ptr<DebugRangesSectionWriter>>
       LegacyRangesWritersByCU;
 
+  /// Stores address writer for each CU.
+  std::unordered_map<uint64_t, std::unique_ptr<DebugAddrWriter>>
+      AddressWritersByCU;
+
   std::mutex LocListDebugInfoPatchesMutex;
 
   /// Dwo id specific its RangesBase.
@@ -116,6 +120,7 @@ class DWARFRewriter {
   void updateUnitDebugInfo(DWARFUnit &Unit, DIEBuilder &DIEBldr,
                            DebugLocWriter &DebugLocWriter,
                            DebugRangesSectionWriter &RangesSectionWriter,
+                           DebugAddrWriter &AddressWriter,
                            std::optional<uint64_t> RangesBase = std::nullopt);
 
   /// Patches the binary for an object's address ranges to be updated.
diff --git a/bolt/lib/Core/DebugData.cpp b/bolt/lib/Core/DebugData.cpp
index 08d4c45aac791..52df5d5e55707 100644
--- a/bolt/lib/Core/DebugData.cpp
+++ b/bolt/lib/Core/DebugData.cpp
@@ -393,6 +393,7 @@ void DebugARangesSectionWriter::writeARangesSection(
 DebugAddrWriter::DebugAddrWriter(BinaryContext *BC) : BC(BC) {
   Buffer = std::make_unique<AddressSectionBuffer>();
   AddressStream = std::make_unique<raw_svector_ostream>(*Buffer);
+  Map = AddressForDWOCU();
 }
 
 void DebugAddrWriter::AddressForDWOCU::dump() {
@@ -405,11 +406,8 @@ void DebugAddrWriter::AddressForDWOCU::dump() {
 }
 uint32_t DebugAddrWriter::getIndexFromAddress(uint64_t Address, DWARFUnit &CU) {
   std::lock_guard<std::mutex> Lock(WriterMutex);
-  const uint64_t CUID = getCUID(CU);
-  if (!AddressMaps.count(CUID))
-    AddressMaps[CUID] = AddressForDWOCU();
-
-  AddressForDWOCU &Map = AddressMaps[CUID];
+  if (Map.begin() == Map.end())
+    Map.empty = true;
   auto Entry = Map.find(Address);
   if (Entry == Map.end()) {
     auto Index = Map.getNextIndex();
@@ -449,25 +447,20 @@ static void updateAddressBase(DIEBuilder &DIEBlder, DebugAddrWriter &AddrWriter,
   }
 }
 
-void DebugAddrWriter::update(DIEBuilder &DIEBlder, DWARFUnit &CU) {
-  // Handling the case where debug information is a mix of Debug fission and
-  // monolithic.
-  if (!CU.getDWOId())
-    return;
-  const uint64_t CUID = getCUID(CU);
-  auto AM = AddressMaps.find(CUID);
-  // Adding to map even if it did not contribute to .debug_addr.
-  // The Skeleton CU might still have DW_AT_GNU_addr_base.
-  uint64_t Offset = Buffer->size();
-  // If does not exist this CUs DWO section didn't contribute to .debug_addr.
-  if (AM == AddressMaps.end())
+void DebugAddrWriter::updateAddrBase(DIEBuilder &DIEBlder, DWARFUnit &CU,
+                                     const uint64_t Offset) {
+  updateAddressBase(DIEBlder, *this, CU, Offset);
+}
+
+void DebugAddrWriter::update() {
+  if (Map.indexToAddressBegin() == Map.indexToAdddessEnd())
     return;
-  std::vector<IndexAddressPair> SortedMap(AM->second.indexToAddressBegin(),
-                                          AM->second.indexToAdddessEnd());
+  std::vector<IndexAddressPair> SortedMap(Map.indexToAddressBegin(),
+                                          Map.indexToAdddessEnd());
   // Sorting address in increasing order of indices.
   llvm::sort(SortedMap, llvm::less_first());
 
-  uint8_t AddrSize = CU.getAddressByteSize();
+  uint8_t AddrSize = AddressByteSize;
   uint32_t Counter = 0;
   auto WriteAddress = [&](uint64_t Address) -> void {
     ++Counter;
@@ -490,10 +483,23 @@ void DebugAddrWriter::update(DIEBuilder &DIEBlder, DWARFUnit &CU) {
       WriteAddress(0);
     WriteAddress(Val.second);
   }
-  updateAddressBase(DIEBlder, *this, CU, Offset);
 }
 
-void DebugAddrWriterDwarf5::update(DIEBuilder &DIEBlder, DWARFUnit &CU) {
+void DebugAddrWriterDwarf5::updateAddrBase(DIEBuilder &DIEBlder, DWARFUnit &CU,
+                                           const uint64_t Offset) {
+  /// Doesn't update address base if the CU doesn't access .debug_addr.
+  if (Map.indexToAddressBegin() == Map.indexToAdddessEnd()) {
+    std::optional<uint64_t> BaseOffset = CU.getAddrOffsetSectionBase();
+    if (!BaseOffset)
+      return;
+  }
+  /// Header for DWARF5 has size 8, so we add it to the offset.
+  updateAddressBase(DIEBlder, *this, CU, Offset + 8);
+}
+
+DenseMap<uint64_t, uint64_t> DebugAddrWriter::UnmodifiedAddressOffsets;
+
+void DebugAddrWriterDwarf5::update() {
   // Need to layout all sections within .debug_addr
   // Within each section sort Address by index.
   const endianness Endian = BC->DwCtx->isLittleEndian()
@@ -505,14 +511,11 @@ void DebugAddrWriterDwarf5::update(DIEBuilder &DIEBlder, DWARFUnit &CU) {
   DWARFDebugAddrTable AddrTable;
   DIDumpOptions DumpOpts;
   constexpr uint32_t HeaderSize = 8;
-  const uint64_t CUID = getCUID(CU);
-  const uint8_t AddrSize = CU.getAddressByteSize();
-  auto AMIter = AddressMaps.find(CUID);
+  const uint8_t AddrSize = AddressByteSize;
   // A case where CU has entry in .debug_addr, but we don't modify addresses
   // for it.
-  if (AMIter == AddressMaps.end()) {
-    AMIter = AddressMaps.insert({CUID, AddressForDWOCU()}).first;
-    std::optional<uint64_t> BaseOffset = CU.getAddrOffsetSectionBase();
+  if (Map.indexToAddressBegin() == Map.indexToAdddessEnd()) {
+    std::optional<uint64_t> BaseOffset = AddrOffsetSectionBase;
     if (!BaseOffset)
       return;
     // Address base offset is to the first entry.
@@ -520,7 +523,6 @@ void DebugAddrWriterDwarf5::update(DIEBuilder &DIEBlder, DWARFUnit &CU) {
     uint64_t Offset = *BaseOffset - HeaderSize;
     auto Iter = UnmodifiedAddressOffsets.find(Offset);
     if (Iter != UnmodifiedAddressOffsets.end()) {
-      updateAddressBase(DIEBlder, *this, CU, Iter->getSecond());
       return;
     }
     UnmodifiedAddressOffsets[Offset] = Buffer->size() + HeaderSize;
@@ -529,16 +531,13 @@ void DebugAddrWriterDwarf5::update(DIEBuilder &DIEBlder, DWARFUnit &CU) {
       DumpOpts.RecoverableErrorHandler(std::move(Err));
       return;
     }
-
     uint32_t Index = 0;
     for (uint64_t Addr : AddrTable.getAddressEntries())
-      AMIter->second.insert(Addr, Index++);
+      Map.insert(Addr, Index++);
   }
 
-  updateAddressBase(DIEBlder, *this, CU, Buffer->size() + HeaderSize);
-
-  std::vector<IndexAddressPair> SortedMap(AMIter->second.indexToAddressBegin(),
-                                          AMIter->second.indexToAdddessEnd());
+  std::vector<IndexAddressPair> SortedMap(Map.indexToAddressBegin(),
+                                          Map.indexToAdddessEnd());
   // Sorting address in increasing order of indices.
   llvm::sort(SortedMap, llvm::less_first());
   // Writing out Header
@@ -789,8 +788,6 @@ void DebugLoclistWriter::finalize(DIEBuilder &DIEBldr, DIE &Die) {
     finalizeDWARF5(DIEBldr, Die);
 }
 
-DebugAddrWriter *DebugLoclistWriter::AddrWriter = nullptr;
-
 static std::string encodeLE(size_t ByteSize, uint64_t NewValue) {
   std::string LE64(ByteSize, 0);
   for (size_t I = 0; I < ByteSize; ++I) {
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index c995df57ec276..49b2ec43ad001 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -617,7 +617,6 @@ void DWARFRewriter::updateDebugInfo() {
   if (BC.isDWARF5Used()) {
     AddrWriter = std::make_unique<DebugAddrWriterDwarf5>(&BC);
     RangeListsSectionWriter = std::make_unique<DebugRangeListsSectionWriter>();
-    DebugRangeListsSectionWriter::setAddressWriter(AddrWriter.get());
   } else {
     AddrWriter = std::make_unique<DebugAddrWriter>(&BC);
   }
@@ -625,8 +624,6 @@ void DWARFRewriter::updateDebugInfo() {
   if (BC.isDWARFLegacyUsed())
     LegacyRangesSectionWriter = std::make_unique<DebugRangesSectionWriter>();
 
-  DebugLoclistWriter::setAddressWriter(AddrWriter.get());
-
   uint32_t CUIndex = 0;
   std::mutex AccessMutex;
   // Needs to be invoked in the same order as CUs are processed.
@@ -634,8 +631,16 @@ void DWARFRewriter::updateDebugInfo() {
     std::lock_guard<std::mutex> Lock(AccessMutex);
     const uint16_t DwarfVersion = CU.getVersion();
     if (DwarfVersion >= 5) {
-      LocListWritersByCU[CUIndex] =
-          std::make_unique<DebugLoclistWriter>(CU, DwarfVersion, false);
+      auto AddrW = std::make_unique<DebugAddrWriterDwarf5>(&BC);
+      AddressWritersByCU[CU.getOffset()] = std::move(AddrW);
+      DebugRangeListsSectionWriter::setAddressWriter(
+          AddressWritersByCU[CU.getOffset()].get());
+      AddressWritersByCU[CU.getOffset()]->setAddressByteSize(
+          CU.getAddressByteSize());
+      AddressWritersByCU[CU.getOffset()]->setAddrOffsetSectionBase(
+          CU.getAddrOffsetSectionBase());
+      LocListWritersByCU[CUIndex] = std::make_unique<DebugLoclistWriter>(
+          CU, DwarfVersion, false, AddressWritersByCU[CU.getOffset()].get());
 
       if (std::optional<uint64_t> DWOId = CU.getDWOId()) {
         assert(RangeListsWritersByCU.count(*DWOId) == 0 &&
@@ -647,6 +652,10 @@ void DWARFRewriter::updateDebugInfo() {
       }
 
     } else {
+      auto AddrW = std::make_unique<DebugAddrWriter>(&BC);
+      AddressWritersByCU[CU.getOffset()] = std::move(AddrW);
+      AddressWritersByCU[CU.getOffset()]->setAddressByteSize(
+          CU.getAddressByteSize());
       LocListWritersByCU[CUIndex] = std::make_unique<DebugLocWriter>();
       if (std::optional<uint64_t> DWOId = CU.getDWOId()) {
         assert(LegacyRangesWritersByCU.count(*DWOId) == 0 &&
@@ -679,6 +688,7 @@ void DWARFRewriter::updateDebugInfo() {
     DebugRangesSectionWriter *RangesSectionWriter =
         Unit->getVersion() >= 5 ? RangeListsSectionWriter.get()
                                 : LegacyRangesSectionWriter.get();
+    auto &AddressWriter = AddressWritersByCU[Unit->getOffset()];
     // Skipping CUs that failed to load.
     if (SplitCU) {
       DIEBuilder DWODIEBuilder(BC, &(*SplitCU)->getContext(), DebugNamesTable,
@@ -699,7 +709,8 @@ void DWARFRewriter::updateDebugInfo() {
       DWODIEBuilder.updateDWONameCompDirForTypes(DWOStrOffstsWriter,
                                                  DWOStrWriter, **SplitCU,
                                                  DwarfOutputPath, DWOName);
-      DebugLoclistWriter DebugLocDWoWriter(*Unit, Unit->getVersion(), true);
+      DebugLoclistWriter DebugLocDWoWriter(*Unit, Unit->getVersion(), true,
+                                           AddressWriter.get());
       DebugRangesSectionWriter *TempRangesSectionWriter = RangesSectionWriter;
       if (Unit->getVersion() >= 5) {
         TempRangesSectionWriter = RangeListsWritersByCU[*DWOId].get();
@@ -710,7 +721,7 @@ void DWARFRewriter::updateDebugInfo() {
       }
 
       updateUnitDebugInfo(*(*SplitCU), DWODIEBuilder, DebugLocDWoWriter,
-                          *TempRangesSectionWriter);
+                          *TempRangesSectionWriter, *AddressWriter.get());
       DebugLocDWoWriter.finalize(DWODIEBuilder,
                                  *DWODIEBuilder.getUnitDIEbyUnit(**SplitCU));
       if (Unit->getVersion() >= 5)
@@ -729,11 +740,10 @@ void DWARFRewriter::updateDebugInfo() {
     }
 
     updateUnitDebugInfo(*Unit, *DIEBlder, *DebugLocWriter, *RangesSectionWriter,
-                        RangesBase);
+                        *AddressWriter.get(), RangesBase);
     DebugLocWriter->finalize(*DIEBlder, *DIEBlder->getUnitDIEbyUnit(*Unit));
     if (Unit->getVersion() >= 5)
       RangesSectionWriter->finalizeSection();
-    AddrWriter->update(*DIEBlder, *Unit);
   };
 
   DIEBuilder DIEBlder(BC, BC.DwCtx.get(), DebugNamesTable);
@@ -783,7 +793,7 @@ void DWARFRewriter::updateDebugInfo() {
 void DWARFRewriter::updateUnitDebugInfo(
     DWARFUnit &Unit, DIEBuilder &DIEBldr, DebugLocWriter &DebugLocWriter,
     DebugRangesSectionWriter &RangesSectionWriter,
-    std::optional<uint64_t> RangesBase) {
+    DebugAddrWriter &AddressWriter, std::optional<uint64_t> RangesBase) {
   // Cache debug ranges so that the offset for identical ranges could be reused.
   std::map<DebugAddressRangesVector, uint64_t> CachedRanges;
 
@@ -817,7 +827,7 @@ void DWARFRewriter::updateUnitDebugInfo(
 
     if (FormLowPC == dwarf::DW_FORM_addrx ||
         FormLowPC == dwarf::DW_FORM_GNU_addr_index)
-      LowPC = AddrWriter->getIndexFromAddress(LowPC, Unit);
+      LowPC = AddressWriter.getIndexFromAddress(LowPC, Unit);
 
     if (LowPCVal)
       DIEBldr.replaceValue(Die, AttrLowPC, FormLowPC, DIEInteger(LowPC));
@@ -981,7 +991,7 @@ void DWARFRewriter::updateUnitDebugInfo(
 
         if (AttrVal.getForm() == dwarf::DW_FORM_addrx) {
           const uint32_t Index =
-              AddrWriter->getIndexFromAddress(UpdatedAddress, Unit);
+              AddressWriter.getIndexFromAddress(UpdatedAddress, Unit);
           DIEBldr.replaceValue(Die, AttrVal.getAttribute(), AttrVal.getForm(),
                                DIEInteger(Index));
         } else if (AttrVal.getForm() == dwarf::DW_FORM_addr) {
@@ -1198,7 +1208,7 @@ void DWARFRewriter::updateUnitDebugInfo(
                 assert(EntryAddress && "Address is not found.");
                 assert(Index <= std::numeric_limits<uint32_t>::max() &&
                        "Invalid Operand Index.");
-                const uint32_t AddrIndex = AddrWriter->getIndexFromAddress(
+                const uint32_t AddrIndex = AddressWriter.getIndexFromAddress(
                     EntryAddress->Address, Unit);
                 // update Index into .debug_address section for DW_AT_location.
                 // The Size field is not stored in IR, we need to minus 1 in
@@ -1250,7 +1260,7 @@ void DWARFRewriter::updateUnitDebugInfo(
           std::lock_guard<std::mutex> Lock(DWARFRewriterMutex);
           if (Form == dwarf::DW_FORM_addrx ||
               Form == dwarf::DW_FORM_GNU_addr_index) {
-            const uint32_t Index = AddrWriter->getIndexFromAddress(
+            const uint32_t Index = AddressWriter.getIndexFromAddress(
                 NewAddress ? NewAddress : Address, Unit);
             DIEBldr.replaceValue(Die, LowPCAttrInfo.getAttribute(),
                                  LowPCAttrInfo.getForm(), DIEInteger(Index));
@@ -1568,14 +1578,10 @@ void DWARFRewriter::finalizeDebugSections(
           LocationListSectionContents->size());
   }
 
-  // AddrWriter should be finalized after debug_loc since more addresses can be
-  // added there.
-  if (AddrWriter->isInitialized()) {
-    AddressSectionBuffer AddressSectionContents = AddrWriter->finalize();
-    BC.registerOrUpdateNoteSection(".debug_addr",
-                                   copyByteArray(AddressSectionContents),
-                                   AddressSectionContents.size());
-  }
+  AddressSectionBuffer AddressSectionContents = AddrWriter->finalize();
+  BC.registerOrUpdateNoteSection(".debug_addr",
+                                 copyByteArray(AddressSectionContents),
+                                 AddressSectionContents.size());
 
   Streamer.emitAbbrevs(DIEBlder.getAbbrevs(), BC.DwCtx->getMaxVersion());
   Streamer.finish();
@@ -1629,6 +1635,16 @@ void DWARFRewriter::finalizeCompileUnits(DIEBuilder &DIEBlder,
                                          CUOffsetMap &CUMap,
                                          const std::list<DWARFUnit *> &CUs) {
   for (DWARFUnit *CU : CUs) {
+    auto AddressWriterIterator = AddressWritersByCU.find(CU->getOffset());
+    assert(AddressWriterIterator != AddressWritersByCU.end() &&
+           "AddressWriter does not exist for CU");
+    auto &AddressWriter = AddressWriterIterator->second;
+    AddressWriter->update();
+    AddressWriter->updateAddrBase(DIEBlder, *CU, AddrWriter->finalize().size());
+    if (AddressWriter->isInitialized()) {
+      AddressSectionBuffer AddressSectionContents = AddressWriter->finalize();
+      AddrWriter->appendToAddressBuffer(AddressSectionContents);
+    }
     if (CU->getVersion() != 4)
       continue;
     std::optional<uint64_t> DWOId = CU->getDWOId();
@@ -2330,7 +2346,8 @@ void DWARFRewriter::convertToRangesPatchDebugInfo(
   // when it's absent.
   if (IsUnitDie) {
     if (LowForm == dwarf::DW_FORM_addrx) {
-      const uint32_t Index = AddrWriter->getIndexFromAddress(0, Unit);
+      const uint32_t Index =
+          AddressWritersByCU[Unit.getOffset()]->getIndexFromAddress(0, Unit);
       DIEBldr.replaceValue(&Die, LowPCAttrInfo.getAttribute(),
                            LowPCAttrInfo.getForm(), DIEInteger(Index));
     } else {