[llvm] [BOLT] Add pre-aggregated trace support (PR #127125)

Amir Ayupov via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 13 12:54:14 PST 2025


https://github.com/aaupov created https://github.com/llvm/llvm-project/pull/127125

Traces are triplets of branch source, target, and fall-through end (next
branch).

Traces make it simple to differentiate fall-throughs into local-origin
and external-origin ones. Compared to a profile with undifferentiated
fall-throughs, this improves performance by eliminating profile
discontinuity in call-to-continuation fall-throughs. It also makes it
possible to avoid converting the return profile into a
call-to-continuation profile, a conversion which may introduce
statistical biases.

The existing format makes provisions for local-origin (F) and
external-origin (f) fall-throughs, but to choose between them the
profile producer needs to know function boundaries. BOLT has that
information readily available, so providing the origin branch of a
fall-through is a functional replacement for the fall-through kind
(f or F). This also has the effect of combining branches and
fall-throughs into a single record.

As traces subsume the other pre-aggregated profile kinds, BOLT may drop
support for them soon. Users of the pre-aggregated profile format are
advised to migrate to the trace format.

Test Plan: Updated callcont-fallthru.s


From c519c8039d3648e3c88d727e40e07d233d535072 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at meta.com>
Date: Thu, 13 Feb 2025 12:54:04 -0800
Subject: [PATCH] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.4
---
 bolt/include/bolt/Profile/DataAggregator.h | 18 ++++--
 bolt/lib/Profile/DataAggregator.cpp        | 66 +++++++++++++++-------
 bolt/test/X86/callcont-fallthru.s          | 48 ++++++++++------
 bolt/test/link_fdata.py                    |  6 +-
 4 files changed, 91 insertions(+), 47 deletions(-)

diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index aa83d7f9b13ab..56eb463fc98fc 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -94,7 +94,7 @@ class DataAggregator : public DataReader {
 
   /// Used for parsing specific pre-aggregated input files.
   struct AggregatedLBREntry {
-    enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN };
+    enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN, TRACE };
     Location From;
     Location To;
     uint64_t Count;
@@ -197,6 +197,10 @@ class DataAggregator : public DataReader {
 
   BoltAddressTranslation *BAT{nullptr};
 
+  /// Whether pre-aggregated profile needs to convert branch profile into call
+  /// to continuation fallthrough profile.
+  bool NeedsConvertRetProfileToCallCont{false};
+
   /// Update function execution profile with a recorded trace.
   /// A trace is region of code executed between two LBR entries supplied in
   /// execution order.
@@ -268,8 +272,7 @@ class DataAggregator : public DataReader {
                      uint64_t Mispreds);
 
   /// Register a \p Branch.
-  bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds,
-                bool IsPreagg);
+  bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds);
 
   /// Register a trace between two LBR entries supplied in execution order.
   bool doTrace(const LBREntry &First, const LBREntry &Second,
@@ -298,7 +301,7 @@ class DataAggregator : public DataReader {
   ErrorOr<PerfMemSample> parseMemSample();
 
   /// Parse pre-aggregated LBR samples created by an external tool
-  ErrorOr<AggregatedLBREntry> parseAggregatedLBREntry();
+  std::error_code parseAggregatedLBREntry();
 
   /// Parse either buildid:offset or just offset, representing a location in the
   /// binary. Used exclusively for pre-aggregated LBR samples.
@@ -384,14 +387,15 @@ class DataAggregator : public DataReader {
   /// memory.
   ///
   /// File format syntax:
-  /// {B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> <count>
-  ///       [<mispred_count>]
+  /// {B|F|f|T} [<start_id>:]<start_offset> [<end_id>:]<end_offset> [<ft_end>]
+  ///       <count> [<mispred_count>]
   ///
   /// B - indicates an aggregated branch
   /// F - an aggregated fall-through
   /// f - an aggregated fall-through with external origin - used to disambiguate
   ///       between a return hitting a basic block head and a regular internal
   ///       jump to the block
+  /// T - an aggregated trace: branch with a fall-through (from, to, ft_end)
   ///
   /// <start_id> - build id of the object containing the start address. We can
   /// skip it for the main binary and use "X" for an unknown object. This will
@@ -402,6 +406,8 @@ class DataAggregator : public DataReader {
   ///
   /// <end_id>, <end_offset> - same for the end address.
   ///
+  /// <ft_end> - same for the fallthrough_end address.
+  ///
   /// <count> - total aggregated count of the branch or a fall-through.
   ///
   /// <mispred_count> - the number of times the branch was mispredicted.
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index de9ec6c1723d5..a859f27569385 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -711,7 +711,7 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
 }
 
 bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
-                              uint64_t Mispreds, bool IsPreagg) {
+                              uint64_t Mispreds) {
   // Returns whether \p Offset in \p Func contains a return instruction.
   auto checkReturn = [&](const BinaryFunction &Func, const uint64_t Offset) {
     auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
@@ -772,7 +772,8 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
     return false;
 
   // Record call to continuation trace.
-  if (IsPreagg && FromFunc != ToFunc && (IsReturn || IsCallCont)) {
+  if (NeedsConvertRetProfileToCallCont && FromFunc != ToFunc &&
+      (IsReturn || IsCallCont)) {
     LBREntry First{ToOrig - 1, ToOrig - 1, false};
     LBREntry Second{ToOrig, ToOrig, false};
     return doTrace(First, Second, Count);
@@ -1216,23 +1217,30 @@ ErrorOr<Location> DataAggregator::parseLocationOrOffset() {
   return Location(true, BuildID.get(), Offset.get());
 }
 
-ErrorOr<DataAggregator::AggregatedLBREntry>
-DataAggregator::parseAggregatedLBREntry() {
+std::error_code DataAggregator::parseAggregatedLBREntry() {
   while (checkAndConsumeFS()) {
   }
 
   ErrorOr<StringRef> TypeOrErr = parseString(FieldSeparator);
   if (std::error_code EC = TypeOrErr.getError())
     return EC;
+  // Pre-aggregated profile with branches and fallthroughs needs to convert
+  // return profile into call to continuation fall-through.
   auto Type = AggregatedLBREntry::BRANCH;
   if (TypeOrErr.get() == "B") {
+    NeedsConvertRetProfileToCallCont = true;
     Type = AggregatedLBREntry::BRANCH;
   } else if (TypeOrErr.get() == "F") {
+    NeedsConvertRetProfileToCallCont = true;
     Type = AggregatedLBREntry::FT;
   } else if (TypeOrErr.get() == "f") {
+    NeedsConvertRetProfileToCallCont = true;
     Type = AggregatedLBREntry::FT_EXTERNAL_ORIGIN;
+  } else if (TypeOrErr.get() == "T") {
+    // Trace is expanded into B and [Ff]
+    Type = AggregatedLBREntry::TRACE;
   } else {
-    reportError("expected B, F or f");
+    reportError("expected T, B, F or f");
     return make_error_code(llvm::errc::io_error);
   }
 
@@ -1248,6 +1256,15 @@ DataAggregator::parseAggregatedLBREntry() {
   if (std::error_code EC = To.getError())
     return EC;
 
+  ErrorOr<Location> TraceFtEnd = std::error_code();
+  if (Type == AggregatedLBREntry::TRACE) {
+    while (checkAndConsumeFS()) {
+    }
+    TraceFtEnd = parseLocationOrOffset();
+    if (std::error_code EC = TraceFtEnd.getError())
+      return EC;
+  }
+
   while (checkAndConsumeFS()) {
   }
   ErrorOr<int64_t> Frequency =
@@ -1270,9 +1287,24 @@ DataAggregator::parseAggregatedLBREntry() {
     return make_error_code(llvm::errc::io_error);
   }
 
-  return AggregatedLBREntry{From.get(), To.get(),
-                            static_cast<uint64_t>(Frequency.get()), Mispreds,
-                            Type};
+  BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(From->Offset);
+  BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(To->Offset);
+
+  for (BinaryFunction *BF : {FromFunc, ToFunc})
+    if (BF)
+      BF->setHasProfileAvailable();
+
+  uint64_t Count = static_cast<uint64_t>(Frequency.get());
+  AggregatedLBREntry Entry{From.get(), To.get(), Count, Mispreds, Type};
+  AggregatedLBRs.emplace_back(Entry);
+  if (Type == AggregatedLBREntry::TRACE) {
+    auto FtType = (FromFunc == ToFunc) ? AggregatedLBREntry::FT
+                                       : AggregatedLBREntry::FT_EXTERNAL_ORIGIN;
+    AggregatedLBREntry TraceFt{To.get(), TraceFtEnd.get(), Count, 0, FtType};
+    AggregatedLBRs.emplace_back(TraceFt);
+  }
+
+  return std::error_code();
 }
 
 bool DataAggregator::ignoreKernelInterrupt(LBREntry &LBR) const {
@@ -1585,8 +1617,7 @@ void DataAggregator::processBranchEvents() {
   for (const auto &AggrLBR : BranchLBRs) {
     const Trace &Loc = AggrLBR.first;
     const TakenBranchInfo &Info = AggrLBR.second;
-    doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount,
-             /*IsPreagg*/ false);
+    doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount);
   }
 }
 
@@ -1722,18 +1753,10 @@ std::error_code DataAggregator::parsePreAggregatedLBRSamples() {
   outs() << "PERF2BOLT: parsing pre-aggregated profile...\n";
   NamedRegionTimer T("parseAggregated", "Parsing aggregated branch events",
                      TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
-  while (hasData()) {
-    ErrorOr<AggregatedLBREntry> AggrEntry = parseAggregatedLBREntry();
-    if (std::error_code EC = AggrEntry.getError())
+  while (hasData())
+    if (std::error_code EC = parseAggregatedLBREntry())
       return EC;
 
-    for (const uint64_t Addr : {AggrEntry->From.Offset, AggrEntry->To.Offset})
-      if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Addr))
-        BF->setHasProfileAvailable();
-
-    AggregatedLBRs.emplace_back(std::move(AggrEntry.get()));
-  }
-
   return std::error_code();
 }
 
@@ -1746,8 +1769,9 @@ void DataAggregator::processPreAggregated() {
   for (const AggregatedLBREntry &AggrEntry : AggregatedLBRs) {
     switch (AggrEntry.EntryType) {
     case AggregatedLBREntry::BRANCH:
+    case AggregatedLBREntry::TRACE:
       doBranch(AggrEntry.From.Offset, AggrEntry.To.Offset, AggrEntry.Count,
-               AggrEntry.Mispreds, /*IsPreagg*/ true);
+               AggrEntry.Mispreds);
       break;
     case AggregatedLBREntry::FT:
     case AggregatedLBREntry::FT_EXTERNAL_ORIGIN: {
diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s
index d76f869c971fd..95cb4c5fc2df4 100644
--- a/bolt/test/X86/callcont-fallthru.s
+++ b/bolt/test/X86/callcont-fallthru.s
@@ -4,19 +4,21 @@
 # RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
 ## Link against a DSO to ensure PLT entries.
 # RUN: %clangxx %cxxflags %s %t.so -o %t -Wl,-q -nostdlib
-# RUN: link_fdata %s %t %t.pa1 PREAGG
+# RUN: link_fdata %s %t %t.pa1 PREAGG1
 # RUN: link_fdata %s %t %t.pa2 PREAGG2
 # RUN: link_fdata %s %t %t.pa3 PREAGG3
-# RUN: link_fdata %s %t %t.pa4 PREAGG4
+# RUN: link_fdata %s %t %t.pat PREAGGT1
+# RUN: link_fdata %s %t %t.pat2 PREAGGT2
 
 ## Check normal case: fallthrough is not LP or secondary entry.
-# RUN: llvm-strip --strip-unneeded %t -o %t.exe
-# RUN: llvm-bolt %t.exe --pa -p %t.pa1 -o %t.out \
+# RUN: llvm-strip --strip-unneeded %t -o %t.strip
+# RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh
+# RUN: llvm-bolt %t.strip --pa -p %t.pa1 -o %t.out \
 # RUN:   --print-cfg --print-only=main | FileCheck %s
 
 ## Check that getFallthroughsInTrace correctly handles a trace starting at plt
 ## call continuation
-# RUN: llvm-bolt %t.exe --pa -p %t.pa2 -o %t.out2 \
+# RUN: llvm-bolt %t.strip --pa -p %t.pa2 -o %t.out2 \
 # RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK2
 
 ## Check that we don't treat secondary entry points as call continuation sites.
@@ -24,8 +26,21 @@
 # RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
 
 ## Check fallthrough to a landing pad case.
-# RUN: llvm-bolt %t.exe --pa -p %t.pa4 -o %t.out \
-# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK4
+# RUN: llvm-bolt %t.strip --pa -p %t.pa3 -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
+
+## Check pre-aggregated traces attach call continuation fallthrough count
+# RUN: llvm-bolt %t.noeh --pa -p %t.pat -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s
+
+## Check pre-aggregated traces don't attach call continuation fallthrough count
+## to secondary entry point (unstripped)
+# RUN: llvm-bolt %t --pa -p %t.pat2 -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
+## Check pre-aggregated traces don't attach call continuation fallthrough count
+## to landing pad (stripped, LP)
+# RUN: llvm-bolt %t.strip --pa -p %t.pat2 -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
 
   .globl foo
   .type foo, %function
@@ -51,8 +66,9 @@ main:
 	movl	%edi, -0x8(%rbp)
 	movq	%rsi, -0x10(%rbp)
 	callq	puts@PLT
-## Target is a call continuation
-# PREAGG: B X:0 #Ltmp1# 2 0
+## Target is an external-origin call continuation
+# PREAGG1: B X:0 #Ltmp1# 2 0
+# PREAGGT1: T X:0 #Ltmp1# #Ltmp4_br# 2
 # CHECK:      callq puts@PLT
 # CHECK-NEXT: count: 2
 
@@ -63,14 +79,16 @@ Ltmp1:
 
 Ltmp4:
 	cmpl	$0x0, -0x14(%rbp)
+Ltmp4_br:
 	je	Ltmp0
 # CHECK2:      je .Ltmp0
 # CHECK2-NEXT: count: 3
 
 	movl	$0xa, -0x18(%rbp)
 	callq	foo
-## Target is a call continuation
-# PREAGG: B #Lfoo_ret# #Ltmp3# 1 0
+## Target is a binary-local call continuation
+# PREAGG1: B #Lfoo_ret# #Ltmp3# 1 0
+# PREAGGT1: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
 # CHECK:      callq foo
 # CHECK-NEXT: count: 1
 
@@ -79,16 +97,12 @@ Ltmp4:
 # CHECK2:      callq foo
 # CHECK2-NEXT: count: 3
 
-## Target is a secondary entry point
+## Target is a secondary entry point (unstripped) or a landing pad (stripped)
 # PREAGG3: B X:0 #Ltmp3# 2 0
+# PREAGGT2: T X:0 #Ltmp3# #Ltmp3_br# 2
 # CHECK3:      callq foo
 # CHECK3-NEXT: count: 0
 
-## Target is a landing pad
-# PREAGG4: B X:0 #Ltmp3# 2 0
-# CHECK4:      callq puts@PLT
-# CHECK4-NEXT: count: 0
-
 Ltmp3:
 	cmpl	$0x0, -0x18(%rbp)
 Ltmp3_br:
diff --git a/bolt/test/link_fdata.py b/bolt/test/link_fdata.py
index 3837e394ccc87..028823a69ce00 100755
--- a/bolt/test/link_fdata.py
+++ b/bolt/test/link_fdata.py
@@ -34,9 +34,9 @@
 fdata_pat = re.compile(r"([01].*) (?P<exec>\d+) (?P<mispred>\d+)")
 
 # Pre-aggregated profile:
-# {B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> <count>
-# [<mispred_count>]
-preagg_pat = re.compile(r"(?P<type>[BFf]) (?P<offsets_count>.*)")
+# {T|B|F|f} [<start_id>:]<start_offset> [<end_id>:]<end_offset> [<ft_end>]
+# <count> [<mispred_count>]
+preagg_pat = re.compile(r"(?P<type>[TBFf]) (?P<offsets_count>.*)")
 
 # No-LBR profile:
 # <is symbol?> <closest elf symbol or DSO name> <relative address> <count>



More information about the llvm-commits mailing list