[llvm] [BOLT] Deduplicate equal offsets in BAT (PR #78638)

Amir Ayupov via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 18 14:27:21 PST 2024


https://github.com/aaupov created https://github.com/llvm/llvm-project/pull/78638

Encode BRANCHENTRY bits as bitmask for deduplicated entries.

Reduces BAT section size:
- large binary: to 11834216 bytes (0.31x original),
- medium binary: to 1565584 bytes (0.26x original),
- small binary: to 336 bytes (0.23x original).

Test Plan: Updated bolt/test/X86/bolt-address-translation.test


>From c1e688df84b4e90544e80d5ce11b2ef71c11d3af Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Thu, 18 Jan 2024 14:27:11 -0800
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
 =?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.4
---
 bolt/docs/BAT.md                              | 13 ++-
 .../bolt/Profile/BoltAddressTranslation.h     |  8 ++
 bolt/lib/Profile/BoltAddressTranslation.cpp   | 91 +++++++++++++++++--
 bolt/test/X86/bolt-address-translation.test   |  2 +-
 4 files changed, 103 insertions(+), 11 deletions(-)

diff --git a/bolt/docs/BAT.md b/bolt/docs/BAT.md
index 0a2c878ef4ae928..d1cab984d148342 100644
--- a/bolt/docs/BAT.md
+++ b/bolt/docs/BAT.md
@@ -61,7 +61,7 @@ Cold functions table header
 ```
 
 ### Functions table
-Hot and cold functions tables share the encoding except difference marked below.
+Hot and cold functions tables share the encoding except differences marked below.
 Header:
 | Entry  | Encoding | Description |
 | ------ | ----- | ----------- |
@@ -80,9 +80,12 @@ Hot indices are delta encoded, implicitly starting at zero.
 | `Address` | Continuous, Delta, ULEB128 | Function address in the output binary |
 | `HotIndex` | Delta, ULEB128 | Cold functions only: index of corresponding hot function in hot functions table |
 | `NumEntries` | ULEB128 | Number of address translation entries for a function |
+| `EqualElems` | ULEB128 | Hot functions only: number of equal offsets in the beginning of a function |
+| `BranchEntries` | Bitmask, `alignTo(EqualElems, 8)` bits | Hot functions only: if `EqualElems` is non-zero, bitmask denoting entries with `BRANCHENTRY` bit |
 
-Function header is followed by `NumEntries` pairs of offsets for current
-function.
+Function header is followed by `EqualElems` offsets (hot functions only) and
+`NumEntries-EqualElems` (`NumEntries` for cold functions) pairs of offsets for
+current function.
 
 ### Address translation table
 Delta encoding means that only the difference with the previous corresponding
@@ -90,8 +93,10 @@ entry is encoded. Input offsets implicitly start at zero.
 | Entry  | Encoding | Description |
 | ------ | ------| ----------- |
 | `OutputOffset` | Continuous, Delta, ULEB128 | Function offset in output binary |
-| `InputOffset` | Delta, SLEB128 | Function offset in input binary with `BRANCHENTRY` LSB bit |
+| `InputOffset` | Optional, Delta, SLEB128 | Function offset in input binary with `BRANCHENTRY` LSB bit |
 
 `BRANCHENTRY` bit denotes whether a given offset pair is a control flow source
 (branch or call instruction). If not set, it signifies a control flow target
 (basic block offset).
+`InputOffset` is omitted for entries with equal input and output offsets. In
+this case, `BRANCHENTRY` bits are encoded separately in the `BranchEntries` bitmask.
diff --git a/bolt/include/bolt/Profile/BoltAddressTranslation.h b/bolt/include/bolt/Profile/BoltAddressTranslation.h
index f6bd61bc8898735..fa29ece3287a9a4 100644
--- a/bolt/include/bolt/Profile/BoltAddressTranslation.h
+++ b/bolt/include/bolt/Profile/BoltAddressTranslation.h
@@ -130,6 +130,14 @@ class BoltAddressTranslation {
   void parseMaps(std::vector<uint64_t> &HotFuncs, uint64_t &PrevAddress,
                  DataExtractor &DE, uint64_t &Offset, Error &Err);
 
+  /// Returns the bitmask with set bits corresponding to indices of BRANCHENTRY
+  /// entries in function address translation map.
+  APInt calculateBranchEntriesBitMask(MapTy &Map, size_t EqualElems);
+
+  /// Calculate the number of equal offsets (output = input) in the beginning
+  /// of the function.
+  size_t getNumEqualOffsets(const MapTy &Map) const;
+
   std::map<uint64_t, MapTy> Maps;
 
   /// Links outlined cold bocks to their original function
diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp
index d3c33d6e6bc796d..6901e482ddfd4e8 100644
--- a/bolt/lib/Profile/BoltAddressTranslation.cpp
+++ b/bolt/lib/Profile/BoltAddressTranslation.cpp
@@ -8,6 +8,7 @@
 
 #include "bolt/Profile/BoltAddressTranslation.h"
 #include "bolt/Core/BinaryFunction.h"
+#include "llvm/ADT/APInt.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/LEB128.h"
@@ -110,6 +111,34 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) {
   outs() << "BOLT-INFO: Wrote " << Maps.size() << " BAT maps\n";
 }
 
+APInt BoltAddressTranslation::calculateBranchEntriesBitMask(MapTy &Map,
+                                                            size_t EqualElems) {
+  APInt BitMask(alignTo(EqualElems, 8), 0);
+  size_t Index = 0;
+  for (std::pair<const uint32_t, uint32_t> &KeyVal : Map) {
+    if (Index == EqualElems)
+      break;
+    const uint32_t OutputOffset = KeyVal.second;
+    if (OutputOffset & BRANCHENTRY)
+      BitMask.setBit(Index);
+    ++Index;
+  }
+  return BitMask;
+}
+
+size_t BoltAddressTranslation::getNumEqualOffsets(const MapTy &Map) const {
+  size_t EqualOffsets = 0;
+  for (const std::pair<const uint32_t, uint32_t> &KeyVal : Map) {
+    const uint32_t OutputOffset = KeyVal.first;
+    const uint32_t InputOffset = KeyVal.second >> 1;
+    if (OutputOffset == InputOffset)
+      ++EqualOffsets;
+    else
+      break;
+  }
+  return EqualOffsets;
+}
+
 template <bool Cold>
 void BoltAddressTranslation::writeMaps(std::map<uint64_t, MapTy> &Maps,
                                        uint64_t &PrevAddress, raw_ostream &OS) {
@@ -139,14 +168,35 @@ void BoltAddressTranslation::writeMaps(std::map<uint64_t, MapTy> &Maps,
       PrevIndex = HotIndex;
     }
     encodeULEB128(NumEntries, OS);
+    // For hot fragments only: encode the number of equal offsets
+    // (output = input) in the beginning of the function. Only the output
+    // offset is encoded for these entries; the input offset is omitted.
+    const size_t EqualElems = Cold ? 0 : getNumEqualOffsets(Map);
+    if (!Cold) {
+      encodeULEB128(EqualElems, OS);
+      if (EqualElems) {
+        const size_t BranchEntriesBytes = alignTo(EqualElems, 8) / 8;
+        APInt BranchEntries = calculateBranchEntriesBitMask(Map, EqualElems);
+        OS.write(reinterpret_cast<const char *>(BranchEntries.getRawData()),
+                 BranchEntriesBytes);
+        LLVM_DEBUG({
+          dbgs() << "BranchEntries: ";
+          SmallString<8> BitMaskStr;
+          BranchEntries.toString(BitMaskStr, 2, false);
+          dbgs() << BitMaskStr << '\n';
+        });
+      }
+    }
+    size_t Index = 0;
     uint64_t InOffset = 0;
     // Output and Input addresses and delta-encoded
     for (std::pair<const uint32_t, uint32_t> &KeyVal : Map) {
       const uint64_t OutputAddress = KeyVal.first + Address;
       encodeULEB128(OutputAddress - PrevAddress, OS);
       PrevAddress = OutputAddress;
-      encodeSLEB128(KeyVal.second - InOffset, OS);
-      InOffset = KeyVal.second;
+      if (Index++ >= EqualElems)
+        encodeSLEB128(KeyVal.second - InOffset, OS);
+      InOffset = KeyVal.second; // Keeping InOffset as if BRANCHENTRY is encoded
     }
   }
 }
@@ -197,6 +247,29 @@ void BoltAddressTranslation::parseMaps(std::vector<uint64_t> &HotFuncs,
       HotFuncs.push_back(Address);
     }
     const uint32_t NumEntries = DE.getULEB128(&Offset, &Err);
+    // Equal offsets, hot fragments only.
+    size_t EqualElems = 0;
+    APInt BEBitMask;
+    if (!Cold) {
+      EqualElems = DE.getULEB128(&Offset, &Err);
+      LLVM_DEBUG(dbgs() << formatv("Equal offsets: {0}, {1} bytes\n",
+                                   EqualElems, getULEB128Size(EqualElems)));
+      if (EqualElems) {
+        const size_t BranchEntriesBytes = alignTo(EqualElems, 8) / 8;
+        BEBitMask = APInt(alignTo(EqualElems, 8), 0);
+        LoadIntFromMemory(
+            BEBitMask,
+            reinterpret_cast<const uint8_t *>(
+                DE.getBytes(&Offset, BranchEntriesBytes, &Err).data()),
+            BranchEntriesBytes);
+        LLVM_DEBUG({
+          dbgs() << "BEBitMask: ";
+          SmallString<8> BitMaskStr;
+          BEBitMask.toString(BitMaskStr, 2, false);
+          dbgs() << BitMaskStr << ", " << BranchEntriesBytes << " bytes\n";
+        });
+      }
+    }
     MapTy Map;
 
     LLVM_DEBUG(dbgs() << "Parsing " << NumEntries << " entries for 0x"
@@ -207,14 +280,20 @@ void BoltAddressTranslation::parseMaps(std::vector<uint64_t> &HotFuncs,
       const uint64_t OutputAddress = PrevAddress + OutputDelta;
       const uint64_t OutputOffset = OutputAddress - Address;
       PrevAddress = OutputAddress;
-      const int64_t InputDelta = DE.getSLEB128(&Offset, &Err);
-      InputOffset += InputDelta;
+      int64_t InputDelta = 0;
+      if (J < EqualElems) {
+        InputOffset = (OutputOffset << 1) | BEBitMask[J];
+      } else {
+        InputDelta = DE.getSLEB128(&Offset, &Err);
+        InputOffset += InputDelta;
+      }
       Map.insert(std::pair<uint32_t, uint32_t>(OutputOffset, InputOffset));
       LLVM_DEBUG(
           dbgs() << formatv("{0:x} -> {1:x} ({2}/{3}b -> {4}/{5}b), {6:x}\n",
                             OutputOffset, InputOffset, OutputDelta,
-                            encodeULEB128(OutputDelta, nulls()), InputDelta,
-                            encodeSLEB128(InputDelta, nulls()), OutputAddress));
+                            getULEB128Size(OutputDelta), InputDelta,
+                            (J < EqualElems) ? 0 : getSLEB128Size(InputDelta),
+                            OutputAddress));
     }
     Maps.insert(std::pair<uint64_t, MapTy>(Address, Map));
   }
diff --git a/bolt/test/X86/bolt-address-translation.test b/bolt/test/X86/bolt-address-translation.test
index 430b4cb007310fa..f2020af2edebde9 100644
--- a/bolt/test/X86/bolt-address-translation.test
+++ b/bolt/test/X86/bolt-address-translation.test
@@ -36,7 +36,7 @@
 #
 # CHECK:      BOLT: 3 out of 7 functions were overwritten.
 # CHECK:      BOLT-INFO: Wrote 6 BAT maps
-# CHECK:      BOLT-INFO: BAT section size (bytes): 404
+# CHECK:      BOLT-INFO: BAT section size (bytes): 336
 #
 # usqrt mappings (hot part). We match against any key (left side containing
 # the bolted binary offsets) because BOLT may change where it puts instructions



More information about the llvm-commits mailing list