[llvm] [BOLT] Set call to continuation count in pre-aggregated profile in BAT mode (PR #115334)

Amir Ayupov via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 27 08:17:24 PST 2024


https://github.com/aaupov updated https://github.com/llvm/llvm-project/pull/115334

>From b6adebcc7ef911b4ca99cc8eb9aeed1681f95391 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Thu, 7 Nov 2024 08:01:35 -0800
Subject: [PATCH 1/4] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20ch?=
 =?UTF-8?q?anges=20to=20main=20this=20commit=20is=20based=20on?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.4

[skip ci]
---
 bolt/docs/BAT.md                              |  12 +-
 bolt/include/bolt/Core/BinaryFunction.h       |   4 +
 .../bolt/Profile/BoltAddressTranslation.h     |   3 +
 bolt/include/bolt/Profile/DataAggregator.h    |   3 +-
 bolt/lib/Profile/BoltAddressTranslation.cpp   |  70 +++++----
 bolt/lib/Profile/DataAggregator.cpp           | 111 +++++++++-----
 bolt/test/X86/callcont-fallthru.s             | 138 ++++++++++++++++++
 7 files changed, 273 insertions(+), 68 deletions(-)
 create mode 100644 bolt/test/X86/callcont-fallthru.s

diff --git a/bolt/docs/BAT.md b/bolt/docs/BAT.md
index 817ad288aa34ba..3b42c36541acd3 100644
--- a/bolt/docs/BAT.md
+++ b/bolt/docs/BAT.md
@@ -54,7 +54,7 @@ Functions table:
 |      table       |
 |                  |
 | Secondary entry  |
-|      points      |
+|  points and LPs  |
 |------------------|
 
 ```
@@ -80,7 +80,7 @@ Hot indices are delta encoded, implicitly starting at zero.
 | `HotIndex` | Delta, ULEB128 | Index of corresponding hot function in hot functions table | Cold |
 | `FuncHash` | 8b | Function hash for input function | Hot |
 | `NumBlocks` | ULEB128 | Number of basic blocks in the original function | Hot |
-| `NumSecEntryPoints` | ULEB128 | Number of secondary entry points in the original function | Hot |
+| `NumSecEntryPoints` | ULEB128 | Number of secondary entry points and landing pads in the original function | Hot |
 | `ColdInputSkew` | ULEB128 | Skew to apply to all input offsets | Cold |
 | `NumEntries` | ULEB128 | Number of address translation entries for a function | Both |
 | `EqualElems` | ULEB128 | Number of equal offsets in the beginning of a function | Both |
@@ -116,7 +116,11 @@ input basic block mapping.
 
 ### Secondary Entry Points table
 The table is emitted for hot fragments only. It contains `NumSecEntryPoints`
-offsets denoting secondary entry points, delta encoded, implicitly starting at zero.
+offsets denoting secondary entry points and landing pads, delta encoded,
+implicitly starting at zero.
 | Entry | Encoding | Description |
 | ----- | -------- | ----------- |
-| `SecEntryPoint` | Delta, ULEB128 | Secondary entry point offset |
+| `SecEntryPoint` | Delta, ULEB128 | Secondary entry point offset with `LPENTRY` LSB bit |
+
+`LPENTRY` bit denotes whether a given offset is a landing pad block. If not set,
+the offset is a secondary entry point.
diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index 0b875ed6881014..0b3682353f736e 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -908,6 +908,10 @@ class BinaryFunction {
     return BB && BB->getOffset() == Offset ? BB : nullptr;
   }
 
+  const BinaryBasicBlock *getBasicBlockAtOffset(uint64_t Offset) const {
+    return const_cast<BinaryFunction *>(this)->getBasicBlockAtOffset(Offset);
+  }
+
   /// Retrieve the landing pad BB associated with invoke instruction \p Invoke
   /// that is in \p BB. Return nullptr if none exists
   BinaryBasicBlock *getLandingPadBBFor(const BinaryBasicBlock &BB,
diff --git a/bolt/include/bolt/Profile/BoltAddressTranslation.h b/bolt/include/bolt/Profile/BoltAddressTranslation.h
index fcc578f35e322a..aaf361b093bfdb 100644
--- a/bolt/include/bolt/Profile/BoltAddressTranslation.h
+++ b/bolt/include/bolt/Profile/BoltAddressTranslation.h
@@ -182,6 +182,9 @@ class BoltAddressTranslation {
   /// translation map entry
   const static uint32_t BRANCHENTRY = 0x1;
 
+  /// Identifies a landing pad in secondary entry point map entry.
+  const static uint32_t LPENTRY = 0x1;
+
 public:
   /// Map basic block input offset to a basic block index and hash pair.
   class BBHashMapTy {
diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 6453b3070ceb8d..2880bfd03be789 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -266,7 +266,8 @@ class DataAggregator : public DataReader {
                      uint64_t Mispreds);
 
   /// Register a \p Branch.
-  bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds);
+  bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds,
+                bool IsPreagg);
 
   /// Register a trace between two LBR entries supplied in execution order.
   bool doTrace(const LBREntry &First, const LBREntry &Second,
diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp
index 4d005f942699e1..a423315df0793b 100644
--- a/bolt/lib/Profile/BoltAddressTranslation.cpp
+++ b/bolt/lib/Profile/BoltAddressTranslation.cpp
@@ -86,21 +86,16 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) {
     if (Function.isIgnored() || (!BC.HasRelocations && !Function.isSimple()))
       continue;
 
-    uint32_t NumSecondaryEntryPoints = 0;
-    Function.forEachEntryPoint([&](uint64_t Offset, const MCSymbol *) {
-      if (!Offset)
-        return true;
-      ++NumSecondaryEntryPoints;
-      SecondaryEntryPointsMap[OutputAddress].push_back(Offset);
-      return true;
-    });
-
     LLVM_DEBUG(dbgs() << "Function name: " << Function.getPrintName() << "\n");
     LLVM_DEBUG(dbgs() << " Address reference: 0x"
                       << Twine::utohexstr(Function.getOutputAddress()) << "\n");
     LLVM_DEBUG(dbgs() << formatv(" Hash: {0:x}\n", getBFHash(InputAddress)));
-    LLVM_DEBUG(dbgs() << " Secondary Entry Points: " << NumSecondaryEntryPoints
-                      << '\n');
+    LLVM_DEBUG({
+      uint32_t NumSecondaryEntryPoints = 0;
+      if (SecondaryEntryPointsMap.count(InputAddress))
+        NumSecondaryEntryPoints = SecondaryEntryPointsMap[InputAddress].size();
+      dbgs() << " Secondary Entry Points: " << NumSecondaryEntryPoints << '\n';
+    });
 
     MapTy Map;
     for (const BinaryBasicBlock *const BB :
@@ -206,10 +201,9 @@ void BoltAddressTranslation::writeMaps(uint64_t &PrevAddress, raw_ostream &OS) {
                       << Twine::utohexstr(Address) << ".\n");
     encodeULEB128(Address - PrevAddress, OS);
     PrevAddress = Address;
-    const uint32_t NumSecondaryEntryPoints =
-        SecondaryEntryPointsMap.count(Address)
-            ? SecondaryEntryPointsMap[Address].size()
-            : 0;
+    uint32_t NumSecondaryEntryPoints = 0;
+    if (SecondaryEntryPointsMap.count(HotInputAddress))
+      NumSecondaryEntryPoints = SecondaryEntryPointsMap[HotInputAddress].size();
     uint32_t Skew = 0;
     if (Cold) {
       auto HotEntryIt = llvm::lower_bound(HotFuncs, ColdPartSource[Address]);
@@ -281,7 +275,7 @@ void BoltAddressTranslation::writeMaps(uint64_t &PrevAddress, raw_ostream &OS) {
     if (!Cold && NumSecondaryEntryPoints) {
       LLVM_DEBUG(dbgs() << "Secondary entry points: ");
       // Secondary entry point offsets, delta-encoded
-      for (uint32_t Offset : SecondaryEntryPointsMap[Address]) {
+      for (uint32_t Offset : SecondaryEntryPointsMap[HotInputAddress]) {
         encodeULEB128(Offset - PrevOffset, OS);
         LLVM_DEBUG(dbgs() << formatv("{0:x} ", Offset));
         PrevOffset = Offset;
@@ -469,8 +463,12 @@ void BoltAddressTranslation::dump(raw_ostream &OS) const {
       const std::vector<uint32_t> &SecondaryEntryPoints =
           SecondaryEntryPointsIt->second;
       OS << SecondaryEntryPoints.size() << " secondary entry points:\n";
-      for (uint32_t EntryPointOffset : SecondaryEntryPoints)
-        OS << formatv("{0:x}\n", EntryPointOffset);
+      for (uint32_t EntryPointOffset : SecondaryEntryPoints) {
+        OS << formatv("{0:x}", EntryPointOffset >> 1);
+        if (EntryPointOffset & LPENTRY)
+          OS << " (lp)";
+        OS << '\n';
+      }
     }
     OS << "\n";
   }
@@ -582,14 +580,21 @@ void BoltAddressTranslation::saveMetadata(BinaryContext &BC) {
     // changed
     if (BF.isIgnored() || (!BC.HasRelocations && !BF.isSimple()))
       continue;
+    const uint64_t FuncAddress = BF.getAddress();
     // Prepare function and block hashes
-    FuncHashes.addEntry(BF.getAddress(), BF.computeHash());
+    FuncHashes.addEntry(FuncAddress, BF.computeHash());
     BF.computeBlockHashes();
-    BBHashMapTy &BBHashMap = getBBHashMap(BF.getAddress());
+    BBHashMapTy &BBHashMap = getBBHashMap(FuncAddress);
+    std::vector<uint32_t> SecondaryEntryPoints;
     // Set BF/BB metadata
-    for (const BinaryBasicBlock &BB : BF)
+    for (const BinaryBasicBlock &BB : BF) {
       BBHashMap.addEntry(BB.getInputOffset(), BB.getIndex(), BB.getHash());
-    NumBasicBlocksMap.emplace(BF.getAddress(), BF.size());
+      bool IsLandingPad = BB.isLandingPad();
+      if (IsLandingPad || BF.getSecondaryEntryPointSymbol(BB))
+        SecondaryEntryPoints.emplace_back(BB.getOffset() << 1 | IsLandingPad);
+    }
+    SecondaryEntryPointsMap.emplace(FuncAddress, SecondaryEntryPoints);
+    NumBasicBlocksMap.emplace(FuncAddress, BF.size());
   }
 }
 
@@ -599,13 +604,20 @@ BoltAddressTranslation::getSecondaryEntryPointId(uint64_t Address,
   auto FunctionIt = SecondaryEntryPointsMap.find(Address);
   if (FunctionIt == SecondaryEntryPointsMap.end())
     return 0;
-  const std::vector<uint32_t> &Offsets = FunctionIt->second;
-  auto OffsetIt = std::find(Offsets.begin(), Offsets.end(), Offset);
-  if (OffsetIt == Offsets.end())
-    return 0;
-  // Adding one here because main entry point is not stored in BAT, and
-  // enumeration for secondary entry points starts with 1.
-  return OffsetIt - Offsets.begin() + 1;
+  unsigned EntryPoints = 0;
+  // Note we need to scan the vector to get the entry point id because it
+  // contains both entry points and landing pads.
+  for (uint32_t Off : FunctionIt->second) {
+    // Skip landing pads.
+    if (Off & LPENTRY)
+      continue;
+    // Adding one here because main entry point is not stored in BAT, and
+    // enumeration for secondary entry points starts with 1.
+    if (Off >> 1 == Offset)
+      return EntryPoints + 1;
+    ++EntryPoints;
+  }
+  return 0;
 }
 
 std::pair<const BinaryFunction *, unsigned>
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index ffd693f9bbaed4..f667fcd4f049ae 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -778,42 +778,75 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
 }
 
 bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
-                              uint64_t Mispreds) {
-  bool IsReturn = false;
-  auto handleAddress = [&](uint64_t &Addr, bool IsFrom) -> BinaryFunction * {
-    if (BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr)) {
-      Addr -= Func->getAddress();
-      if (IsFrom) {
-        auto checkReturn = [&](auto MaybeInst) {
-          IsReturn = MaybeInst && BC->MIB->isReturn(*MaybeInst);
-        };
-        if (Func->hasInstructions())
-          checkReturn(Func->getInstructionAtOffset(Addr));
-        else
-          checkReturn(Func->disassembleInstructionAtOffset(Addr));
-      }
+                              uint64_t Mispreds, bool IsPreagg) {
+  // Returns whether \p Offset in \p Func contains a return instruction.
+  auto checkReturn = [&](const BinaryFunction &Func, const uint64_t Offset) {
+    auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
+    return Func.hasInstructions()
+               ? isReturn(Func.getInstructionAtOffset(Offset))
+               : isReturn(Func.disassembleInstructionAtOffset(Offset));
+  };
 
-      if (BAT)
-        Addr = BAT->translate(Func->getAddress(), Addr, IsFrom);
+  // Returns whether \p Offset in \p Func may be a call continuation excluding
+  // entry points and landing pads.
+  auto checkCallCont = [&](const BinaryFunction &Func, const uint64_t Offset) {
+    // No call continuation at a function start.
+    if (!Offset)
+      return false;
+
+    // FIXME: support BAT case where the function might be in empty state
+    // (split fragments declared non-simple).
+    if (!Func.hasCFG())
+      return false;
+
+    // The offset should not be an entry point or a landing pad.
+    const BinaryBasicBlock *ContBB = Func.getBasicBlockAtOffset(Offset);
+    return ContBB && !ContBB->isEntryPoint() && !ContBB->isLandingPad();
+  };
 
-      if (BinaryFunction *ParentFunc = getBATParentFunction(*Func)) {
-        Func = ParentFunc;
-        if (IsFrom)
-          NumColdSamples += Count;
-      }
+  // Mutates \p Addr to an offset into the containing function, performing BAT
+  // offset translation and parent lookup.
+  //
+  // Returns the containing function (or BAT parent) and whether the address
+  // corresponds to a return (if \p IsFrom) or a call continuation (otherwise).
+  auto handleAddress = [&](uint64_t &Addr, bool IsFrom) {
+    BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
+    if (!Func)
+      return std::pair{Func, false};
 
-      return Func;
-    }
-    return nullptr;
+    Addr -= Func->getAddress();
+
+    bool IsRetOrCallCont =
+        IsFrom ? checkReturn(*Func, Addr) : checkCallCont(*Func, Addr);
+
+    if (BAT)
+      Addr = BAT->translate(Func->getAddress(), Addr, IsFrom);
+
+    BinaryFunction *ParentFunc = getBATParentFunction(*Func);
+    if (!ParentFunc)
+      return std::pair{Func, IsRetOrCallCont};
+
+    if (IsFrom)
+      NumColdSamples += Count;
+
+    return std::pair{ParentFunc, IsRetOrCallCont};
   };
 
-  BinaryFunction *FromFunc = handleAddress(From, /*IsFrom=*/true);
+  uint64_t ToOrig = To;
+  auto [FromFunc, IsReturn] = handleAddress(From, /*IsFrom=*/true);
+  auto [ToFunc, IsCallCont] = handleAddress(To, /*IsFrom=*/false);
+  if (!FromFunc && !ToFunc)
+    return false;
+
+  // Record call to continuation trace.
+  if (IsPreagg && FromFunc != ToFunc && (IsReturn || IsCallCont)) {
+    LBREntry First{ToOrig - 1, ToOrig - 1, false};
+    LBREntry Second{ToOrig, ToOrig, false};
+    return doTrace(First, Second, Count);
+  }
   // Ignore returns.
   if (IsReturn)
     return true;
-  BinaryFunction *ToFunc = handleAddress(To, /*IsFrom=*/false);
-  if (!FromFunc && !ToFunc)
-    return false;
 
   // Treat recursive control transfers as inter-branches.
   if (FromFunc == ToFunc && To != 0) {
@@ -830,10 +863,19 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
   BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(Second.From);
   if (!FromFunc || !ToFunc) {
     LLVM_DEBUG({
-      dbgs() << "Out of range trace starting in " << FromFunc->getPrintName()
-             << formatv(" @ {0:x}", First.To - FromFunc->getAddress())
-             << " and ending in " << ToFunc->getPrintName()
-             << formatv(" @ {0:x}\n", Second.From - ToFunc->getAddress());
+      dbgs() << "Out of range trace starting in ";
+      if (FromFunc)
+        dbgs() << formatv("{0} @ {1:x}", *FromFunc,
+                          First.To - FromFunc->getAddress());
+      else
+        dbgs() << Twine::utohexstr(First.To);
+      dbgs() << " and ending in ";
+      if (ToFunc)
+        dbgs() << formatv("{0} @ {1:x}", *ToFunc,
+                          Second.From - ToFunc->getAddress());
+      else
+        dbgs() << Twine::utohexstr(Second.From);
+      dbgs() << '\n';
     });
     NumLongRangeTraces += Count;
     return false;
@@ -1620,7 +1662,8 @@ void DataAggregator::processBranchEvents() {
   for (const auto &AggrLBR : BranchLBRs) {
     const Trace &Loc = AggrLBR.first;
     const TakenBranchInfo &Info = AggrLBR.second;
-    doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount);
+    doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount,
+             /*IsPreagg*/ false);
   }
 }
 
@@ -1781,7 +1824,7 @@ void DataAggregator::processPreAggregated() {
     switch (AggrEntry.EntryType) {
     case AggregatedLBREntry::BRANCH:
       doBranch(AggrEntry.From.Offset, AggrEntry.To.Offset, AggrEntry.Count,
-               AggrEntry.Mispreds);
+               AggrEntry.Mispreds, /*IsPreagg=*/true);
       break;
     case AggregatedLBREntry::FT:
     case AggregatedLBREntry::FT_EXTERNAL_ORIGIN: {
diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s
new file mode 100644
index 00000000000000..641beb79ecf2ac
--- /dev/null
+++ b/bolt/test/X86/callcont-fallthru.s
@@ -0,0 +1,138 @@
+## Ensures that a call continuation fallthrough count is set when using
+## pre-aggregated perf data.
+
+# RUN: %clangxx %cxxflags %s -o %t -Wl,-q -nostdlib
+# RUN: link_fdata %s %t %t.pa1 PREAGG
+# RUN: link_fdata %s %t %t.pa2 PREAGG2
+# RUN: link_fdata %s %t %t.pa3 PREAGG3
+# RUN: link_fdata %s %t %t.pa4 PREAGG4
+
+## Check normal case: fallthrough is not LP or secondary entry.
+# RUN: llvm-strip --strip-unneeded %t -o %t.exe
+# RUN: llvm-bolt %t.exe --pa -p %t.pa1 -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s
+
+## Check that getFallthroughsInTrace correctly handles a trace starting at plt
+## call continuation
+# RUN: llvm-bolt %t.exe --pa -p %t.pa2 -o %t.out2 \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK2
+
+## Check that we don't treat secondary entry points as call continuation sites.
+# RUN: llvm-bolt %t --pa -p %t.pa3 -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
+
+## Check fallthrough to a landing pad case.
+# RUN: llvm-bolt %t.exe --pa -p %t.pa4 -o %t.out --enable-bat \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK4
+
+## Check that a landing pad is emitted in BAT
+# RUN: llvm-bat-dump %t.out --dump-all | FileCheck %s --check-prefix=CHECK-BAT
+
+# CHECK-BAT:      1 secondary entry points:
+# CHECK-BAT-NEXT: 0x38 (lp)
+
+  .globl foo
+  .type foo, %function
+foo:
+	pushq	%rbp
+	movq	%rsp, %rbp
+	popq	%rbp
+Lfoo_ret:
+	retq
+.size foo, .-foo
+
+  .globl main
+  .type main, %function
+main:
+.Lfunc_begin0:
+	.cfi_startproc
+	.cfi_personality 155, DW.ref.__gxx_personality_v0
+	.cfi_lsda 27, .Lexception0
+	pushq	%rbp
+	movq	%rsp, %rbp
+	subq	$0x20, %rsp
+	movl	$0x0, -0x4(%rbp)
+	movl	%edi, -0x8(%rbp)
+	movq	%rsi, -0x10(%rbp)
+	callq	puts at PLT
+## Target is a call continuation
+# PREAGG: B X:0 #Ltmp1# 2 0
+# CHECK:      callq puts at PLT
+# CHECK-NEXT: count: 2
+
+Ltmp1:
+	movq	-0x10(%rbp), %rax
+	movq	0x8(%rax), %rdi
+	movl	%eax, -0x14(%rbp)
+
+Ltmp4:
+	cmpl	$0x0, -0x14(%rbp)
+	je	Ltmp0
+# CHECK2:      je .Ltmp0
+# CHECK2-NEXT: count: 3
+
+	movl	$0xa, -0x18(%rbp)
+	callq	foo
+## Target is a call continuation
+# PREAGG: B #Lfoo_ret# #Ltmp3# 1 0
+# CHECK:      callq foo
+# CHECK-NEXT: count: 1
+
+## PLT call continuation fallthrough spanning the call
+# PREAGG2: F #Ltmp1# #Ltmp3_br# 3
+# CHECK2:      callq foo
+# CHECK2-NEXT: count: 3
+
+## Target is a secondary entry point
+# PREAGG3: B X:0 #Ltmp3# 2 0
+# CHECK3:      callq foo
+# CHECK3-NEXT: count: 0
+
+## Target is a landing pad
+# PREAGG4: B X:0 #Ltmp3# 2 0
+# CHECK4:      callq puts at PLT
+# CHECK4-NEXT: count: 0
+
+Ltmp3:
+	cmpl	$0x0, -0x18(%rbp)
+Ltmp3_br:
+	jmp	Ltmp2
+
+Ltmp2:
+	movl	-0x18(%rbp), %eax
+	addl	$-0x1, %eax
+	movl	%eax, -0x18(%rbp)
+	jmp	Ltmp3
+	jmp	Ltmp4
+	jmp	Ltmp1
+
+Ltmp0:
+	xorl	%eax, %eax
+	addq	$0x20, %rsp
+	popq	%rbp
+	retq
+.Lfunc_end0:
+  .cfi_endproc
+.size main, .-main
+
+	.section	.gcc_except_table,"a", at progbits
+	.p2align	2, 0x0
+GCC_except_table0:
+.Lexception0:
+	.byte	255                             # @LPStart Encoding = omit
+	.byte	255                             # @TType Encoding = omit
+	.byte	1                               # Call site Encoding = uleb128
+	.uleb128 .Lcst_end0-.Lcst_begin0
+.Lcst_begin0:
+	.uleb128 .Lfunc_begin0-.Lfunc_begin0    # >> Call Site 1 <<
+	.uleb128 .Lfunc_end0-.Lfunc_begin0           #   Call between .Lfunc_begin0 and .Lfunc_end0
+	.uleb128 Ltmp3-.Lfunc_begin0           #     jumps to Ltmp3
+	.byte	0                               #     has no landing pad
+	.byte	0                               #   On action: cleanup
+.Lcst_end0:
+	.p2align	2, 0x0
+	.hidden	DW.ref.__gxx_personality_v0
+	.weak	DW.ref.__gxx_personality_v0
+	.section	.data.DW.ref.__gxx_personality_v0,"awG", at progbits,DW.ref.__gxx_personality_v0,comdat
+	.p2align	3, 0x0
+	.type	DW.ref.__gxx_personality_v0, at object

>From 9476fad1aa50282a38614a63a6a5a41f0ac42532 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Wed, 27 Nov 2024 15:59:02 +0100
Subject: [PATCH 2/4] Preserve CFG until BAT, use it to check call cont landing
 pads, encode them in secondary entry points table

---
 bolt/docs/BAT.md                              |  20 +---
 .../bolt/Profile/BoltAddressTranslation.h     |  26 +----
 bolt/lib/Core/BinaryEmitter.cpp               |   3 +-
 bolt/lib/Profile/BoltAddressTranslation.cpp   | 104 +++++-------------
 bolt/lib/Profile/DataAggregator.cpp           |  17 +--
 5 files changed, 44 insertions(+), 126 deletions(-)

diff --git a/bolt/docs/BAT.md b/bolt/docs/BAT.md
index 20340884621b9a..828114310e195f 100644
--- a/bolt/docs/BAT.md
+++ b/bolt/docs/BAT.md
@@ -115,21 +115,13 @@ Deleted basic blocks are emitted as having `OutputOffset` equal to the size of
 the function. They don't affect address translation and only participate in
 input basic block mapping.
 
-### Secondary Entry Points table
+### Secondary Entry Points and Call Continuation Landing Pads table
 The table is emitted for hot fragments only. It contains `NumSecEntryPoints`
-offsets denoting secondary entry points, delta encoded, implicitly starting at zero.
+offsets, delta encoded, implicitly starting at zero.
 | Entry | Encoding | Description |
 | ----- | -------- | ----------- |
-| `SecEntryPoint` | Delta, ULEB128 | Secondary entry point offset |
+| `OutputOffset` | Delta, ULEB128 | An offset of secondary entry point or a call continuation landing pad\*|
 
-### Call continuation landing pads table
-This table contains the addresses of call continuation blocks that are also
-landing pads, to aid pre-aggregated profile conversion. The table is optional
-for backwards compatibility, but new versions of BOLT will always emit it.
-
-| Entry | Encoding | Description |
-| ----- | -------- | ----------- |
-| `NumEntries` | ULEB128 | Number of addresses |
-| `InputAddress` | Delta, ULEB128 | `NumEntries` input addresses of call continuation landing pad blocks |
-
-Addresses are delta encoded, implicitly starting at zero.
+Call continuation landing pads offsets are shifted by the size of the function
+for backwards compatibility (treated as entry points past the end of the
+function).
diff --git a/bolt/include/bolt/Profile/BoltAddressTranslation.h b/bolt/include/bolt/Profile/BoltAddressTranslation.h
index b04ed7a82eeefb..f956f48b8356b2 100644
--- a/bolt/include/bolt/Profile/BoltAddressTranslation.h
+++ b/bolt/include/bolt/Profile/BoltAddressTranslation.h
@@ -143,20 +143,12 @@ class BoltAddressTranslation {
   /// Write the serialized address translation table for a function.
   template <bool Cold> void writeMaps(uint64_t &PrevAddress, raw_ostream &OS);
 
-  /// Write call continuation landing pad addresses.
-  void writeCallContLandingPads(raw_ostream &OS);
-
   /// Read the serialized address translation table for a function.
   /// Return a parse error if failed.
   template <bool Cold>
   void parseMaps(uint64_t &PrevAddress, DataExtractor &DE, uint64_t &Offset,
                  Error &Err);
 
-
-  /// Read the table with call continuation landing pad offsets.
-  void parseCallContLandingPads(DataExtractor &DE, uint64_t &Offset,
-                                Error &Err);
-
   /// Returns the bitmask with set bits corresponding to indices of BRANCHENTRY
   /// entries in function address translation map.
   APInt calculateBranchEntriesBitMask(MapTy &Map, size_t EqualElems) const;
@@ -176,14 +168,6 @@ class BoltAddressTranslation {
   /// Map a function to its secondary entry points vector
   std::unordered_map<uint64_t, std::vector<uint32_t>> SecondaryEntryPointsMap;
 
-  /// Vector with call continuation landing pads input addresses (pre-BOLT
-  /// binary).
-  std::vector<uint64_t> CallContLandingPadAddrs;
-
-  /// Return a secondary entry point ID for a function located at \p Address and
-  /// \p Offset within that function.
-  unsigned getSecondaryEntryPointId(uint64_t Address, uint32_t Offset) const;
-
   /// Links outlined cold bocks to their original function
   std::map<uint64_t, uint64_t> ColdPartSource;
 
@@ -195,13 +179,9 @@ class BoltAddressTranslation {
   const static uint32_t BRANCHENTRY = 0x1;
 
 public:
-  /// Returns whether a given \p Offset is a secondary entry point in function
-  /// with address \p Address.
-  bool isSecondaryEntry(uint64_t Address, uint32_t Offset) const;
-
-  /// Returns whether a given \p Offset is a call continuation landing pad in
-  /// function with address \p Address.
-  bool isCallContinuationLandingPad(uint64_t Address, uint32_t Offset) const;
+  /// Return a secondary entry point ID for a function located at \p Address and
+  /// \p Offset within that function.
+  unsigned getSecondaryEntryPointId(uint64_t Address, uint32_t Offset) const;
 
   /// Map basic block input offset to a basic block index and hash pair.
   class BBHashMapTy {
diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index f34a94c5779213..31b93418f53943 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -257,7 +257,8 @@ void BinaryEmitter::emitFunctions() {
       Streamer.setAllowAutoPadding(OriginalAllowAutoPadding);
 
       if (Emitted)
-        Function->setEmitted(/*KeepCFG=*/opts::PrintCacheMetrics);
+        Function->setEmitted(/*KeepCFG=*/opts::PrintCacheMetrics ||
+                             opts::EnableBAT);
     }
   };
 
diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp
index 49b272ebd35c1e..6c26ed3e26b419 100644
--- a/bolt/lib/Profile/BoltAddressTranslation.cpp
+++ b/bolt/lib/Profile/BoltAddressTranslation.cpp
@@ -87,13 +87,28 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) {
       continue;
 
     uint32_t NumSecondaryEntryPoints = 0;
-    Function.forEachEntryPoint([&](uint64_t Offset, const MCSymbol *) {
-      if (!Offset)
-        return true;
+    for (const BinaryBasicBlock &BB : llvm::drop_begin(Function)) {
+      if (BB.isEntryPoint()) {
+        ++NumSecondaryEntryPoints;
+        SecondaryEntryPointsMap[OutputAddress].push_back(BB.getOffset());
+        continue;
+      }
+      // Add call continuation landing pads, offset by function size
+      if (!BB.isLandingPad())
+        continue;
+      const BinaryBasicBlock *PrevBB =
+          Function.getLayout().getBlock(BB.getIndex() - 1);
+      if (!PrevBB->isSuccessor(&BB))
+        continue;
+      const MCInst *Instr = PrevBB->getLastNonPseudoInstr();
+      if (!Instr || !BC.MIB->isCall(*Instr))
+        continue;
       ++NumSecondaryEntryPoints;
-      SecondaryEntryPointsMap[OutputAddress].push_back(Offset);
-      return true;
-    });
+      SecondaryEntryPointsMap[OutputAddress].push_back(
+          Function.getOutputSize() + BB.getOffset());
+    }
+    if (NumSecondaryEntryPoints)
+      llvm::sort(SecondaryEntryPointsMap[OutputAddress]);
 
     LLVM_DEBUG(dbgs() << "Function name: " << Function.getPrintName() << "\n");
     LLVM_DEBUG(dbgs() << " Address reference: 0x"
@@ -145,7 +160,6 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) {
   uint64_t PrevAddress = 0;
   writeMaps</*Cold=*/false>(PrevAddress, OS);
   writeMaps</*Cold=*/true>(PrevAddress, OS);
-  writeCallContLandingPads(OS);
 
   BC.outs() << "BOLT-INFO: Wrote " << Maps.size() << " BAT maps\n";
   BC.outs() << "BOLT-INFO: Wrote " << FuncHashes.getNumFunctions()
@@ -292,15 +306,6 @@ void BoltAddressTranslation::writeMaps(uint64_t &PrevAddress, raw_ostream &OS) {
   }
 }
 
-void BoltAddressTranslation::writeCallContLandingPads(raw_ostream &OS) {
-  encodeULEB128(CallContLandingPadAddrs.size(), OS);
-  uint64_t PrevAddress = 0;
-  for (const uint64_t Address : CallContLandingPadAddrs) {
-    encodeULEB128(Address - PrevAddress, OS);
-    PrevAddress = Address;
-  }
-}
-
 std::error_code BoltAddressTranslation::parse(raw_ostream &OS, StringRef Buf) {
   DataExtractor DE = DataExtractor(Buf, true, 8);
   uint64_t Offset = 0;
@@ -325,8 +330,6 @@ std::error_code BoltAddressTranslation::parse(raw_ostream &OS, StringRef Buf) {
   parseMaps</*Cold=*/false>(PrevAddress, DE, Offset, Err);
   parseMaps</*Cold=*/true>(PrevAddress, DE, Offset, Err);
   OS << "BOLT-INFO: Parsed " << Maps.size() << " BAT entries\n";
-  if (Offset < Buf.size())
-    parseCallContLandingPads(DE, Offset, Err);
   return errorToErrorCode(std::move(Err));
 }
 
@@ -446,21 +449,6 @@ void BoltAddressTranslation::parseMaps(uint64_t &PrevAddress, DataExtractor &DE,
   }
 }
 
-void BoltAddressTranslation::parseCallContLandingPads(DataExtractor &DE,
-                                                      uint64_t &Offset,
-                                                      Error &Err) {
-  const uint32_t NumEntries = DE.getULEB128(&Offset, &Err);
-  LLVM_DEBUG(dbgs() << "Parsing " << NumEntries
-                    << " call continuation landing pad entries\n");
-  CallContLandingPadAddrs.reserve(NumEntries);
-  uint64_t PrevAddress = 0;
-  for (uint32_t I = 0; I < NumEntries; ++I) {
-    const uint64_t Address = PrevAddress + DE.getULEB128(&Offset, &Err);
-    CallContLandingPadAddrs.emplace_back(Address);
-    PrevAddress = Address;
-  }
-}
-
 void BoltAddressTranslation::dump(raw_ostream &OS) const {
   const size_t NumTables = Maps.size();
   OS << "BAT tables for " << NumTables << " functions:\n";
@@ -609,25 +597,14 @@ void BoltAddressTranslation::saveMetadata(BinaryContext &BC) {
     // changed
     if (BF.isIgnored() || (!BC.HasRelocations && !BF.isSimple()))
       continue;
-    const uint64_t FuncAddress = BF.getAddress();
     // Prepare function and block hashes
-    FuncHashes.addEntry(FuncAddress, BF.computeHash());
+    FuncHashes.addEntry(BF.getAddress(), BF.computeHash());
     BF.computeBlockHashes();
-    BBHashMapTy &BBHashMap = getBBHashMap(FuncAddress);
+    BBHashMapTy &BBHashMap = getBBHashMap(BF.getAddress());
     // Set BF/BB metadata
-    for (const BinaryBasicBlock &BB : BF) {
-      const uint32_t BlockOffset = BB.getInputOffset();
-      BBHashMap.addEntry(BlockOffset, BB.getIndex(), BB.getHash());
-      // Set CallContLandingPads
-      if (!BB.isEntryPoint() && BB.isLandingPad()) {
-        const BinaryBasicBlock *PrevBB =
-            BF.getLayout().getBlock(BB.getIndex() - 1);
-        const MCInst *Instr = PrevBB->getLastNonPseudoInstr();
-        if (Instr && BC.MIB->isCall(*Instr))
-          CallContLandingPadAddrs.emplace_back(FuncAddress + BlockOffset);
-      }
-    }
-    NumBasicBlocksMap.emplace(FuncAddress, BF.size());
+    for (const BinaryBasicBlock &BB : BF)
+      BBHashMap.addEntry(BB.getInputOffset(), BB.getIndex(), BB.getHash());
+    NumBasicBlocksMap.emplace(BF.getAddress(), BF.size());
   }
 }
 
@@ -638,8 +615,8 @@ BoltAddressTranslation::getSecondaryEntryPointId(uint64_t Address,
   if (FunctionIt == SecondaryEntryPointsMap.end())
     return 0;
   const std::vector<uint32_t> &Offsets = FunctionIt->second;
-  auto OffsetIt = std::find(Offsets.begin(), Offsets.end(), Offset);
-  if (OffsetIt == Offsets.end())
+  auto OffsetIt = llvm::lower_bound(FunctionIt->second, Offset);
+  if (OffsetIt == Offsets.end() || *OffsetIt != Offset)
     return 0;
   // Adding one here because main entry point is not stored in BAT, and
   // enumeration for secondary entry points starts with 1.
@@ -675,30 +652,5 @@ BoltAddressTranslation::translateSymbol(const BinaryContext &BC,
   return std::pair(ParentBF, SecondaryEntryId);
 }
 
-bool BoltAddressTranslation::isSecondaryEntry(uint64_t OutputAddress,
-                                              uint32_t Offset) const {
-  const uint64_t InputOffset =
-      translate(OutputAddress, Offset, /*IsBranchSrc*/ false);
-
-  const uint64_t HotAddress = fetchParentAddress(OutputAddress);
-  auto MapsIter = Maps.find(HotAddress ? HotAddress : OutputAddress);
-  if (MapsIter == Maps.end())
-    return false;
-
-  const uint64_t InputAddress = MapsIter->second.begin()->second;
-
-  auto FunctionIt = SecondaryEntryPointsMap.find(Address);
-  if (FunctionIt == SecondaryEntryPointsMap.end())
-    return false;
-  const std::vector<uint32_t> &Offsets = FunctionIt->second;
-  uint64_t InputOffset = translate(OutputAddress, Offset, /*IsBranchSrc*/ false);
-  auto OffsetIt = llvm::lower_bound(Offsets, InputOffset << 1);
-  return OffsetIt != Offsets.end() && *OffsetIt >> 1 == InputOffset;
-}
-
-bool BoltAddressTranslation::isCallContinuationLandingPad(
-    uint64_t Address, uint32_t Offset) const {
-}
-
 } // namespace bolt
 } // namespace llvm
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index c471e97565a576..fcd3cd90271a26 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -809,18 +809,11 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
       return false;
 
     if (!Func.hasCFG()) {
-      if (!BAT)
-        return false;
-      const uint64_t FuncAddress = Func.getAddress();
-      const BinaryData *BD =
-        BC->getBinaryDataAtAddress(FuncAddress + Offset);
-      unsigned EntryID = 0;
-      if (BD && BD->getSymbol())
-        EntryID = BAT->translateSymbol(*BC, *BD->getSymbol(), 0).second;
-      const MCSymbol &Symbol =
-      const auto [Function, EntryID] = BAT->translateSymbol(BC,
-      return BAT && !(BAT->isSecondaryEntry(FuncAddress, Offset) ||
-                      BAT->isCallContinuationLandingPad(FuncAddress, Offset));
+      const uint64_t Address = Func.getAddress();
+      // Check if offset is a secondary entry point or a call continuation
+      // landing pad (offset shifted by function size).
+      return BAT && !BAT->getSecondaryEntryPointId(Address, Offset) &&
+             !BAT->getSecondaryEntryPointId(Address, Func.getSize() + Offset);
     }
 
     // The offset should not be an entry point or a landing pad.

>From fe9900b5bbb20b8fe3b036d7636c2bf8e25e6c88 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Wed, 27 Nov 2024 07:40:26 -0800
Subject: [PATCH 3/4] Preserve CFG until BAT, use it to check call cont landing
 pads, encode them in secondary entry points table

Created using spr 1.3.4
---
 bolt/docs/BAT.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bolt/docs/BAT.md b/bolt/docs/BAT.md
index 828114310e195f..afbd59220b045c 100644
--- a/bolt/docs/BAT.md
+++ b/bolt/docs/BAT.md
@@ -44,7 +44,6 @@ The general layout is as follows:
 ```
 Hot functions table
 Cold functions table
-Call continuation landing pads table (optional)
 
 Functions table:
 |------------------|

>From 567d8103a7d9175e9ec5ac366fff73091a6032ca Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Wed, 27 Nov 2024 08:17:09 -0800
Subject: [PATCH 4/4] update test

Created using spr 1.3.4
---
 bolt/test/X86/callcont-fallthru.s | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s
index 467600abfb6940..f6f2c7d117a08f 100644
--- a/bolt/test/X86/callcont-fallthru.s
+++ b/bolt/test/X86/callcont-fallthru.s
@@ -25,28 +25,27 @@
 # RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
 
 ## Check that a landing pad is emitted in BAT
-# RUN: llvm-bat-dump %t.out --dump-all | FileCheck %s --check-prefix=CHECK-BAT
+# RUN: llvm-bat-dump %t.out4 --dump-all | FileCheck %s --check-prefix=CHECK-BAT
 
 # CHECK-BAT:      1 secondary entry points:
-# CHECK-BAT-NEXT: 0x38 (lp)
 
 ## Check BAT case of a fallthrough to a call continuation
-# link_fdata %s %t.out3 %t.pa.bat PREAGG
-# RUN: perf2bolt %t.out3 -p %t.pa.bat --pa -o %t.fdata
+# link_fdata %s %t.out4 %t.pa.bat PREAGG
+# RUN: perf2bolt %t.out4 -p %t.pa.bat --pa -o %t.fdata
 # RUN: FileCheck %s --check-prefix=CHECK-BAT-CC --input-file=%t.fdata
 # CHECK-BAT-CC: main
 
 ## Check BAT case of a fallthrough to a secondary entry point or a landing pad
-# link_fdata %s %t.out3 %t.pa.bat2 PREAGG3
+# link_fdata %s %t.out4 %t.pa.bat2 PREAGG3
 
 ## Secondary entry
-# RUN: perf2bolt %t.out3 -p %t.pa.bat2 --pa -o %t.fdata2
+# RUN: perf2bolt %t.out4 -p %t.pa.bat2 --pa -o %t.fdata2
 # RUN: FileCheck %s --check-prefix=CHECK-BAT-ENTRY --input-file=%t.fdata2
 # CHECK-BAT-ENTRY: main
 
 ## Landing pad
-# RUN: llvm-strip --strip-unneeded %t.out3
-# RUN: perf2bolt %t.out3 -p %t.pa.bat2 --pa -o %t.fdata3
+# RUN: llvm-strip --strip-unneeded %t.out4
+# RUN: perf2bolt %t.out4 -p %t.pa.bat2 --pa -o %t.fdata3
 # RUN: FileCheck %s --check-prefix=CHECK-BAT-LP --input-file=%t.fdata3
 # CHECK-BAT-LP: main
 



More information about the llvm-commits mailing list