[llvm] [AArch64][BOLT] Ensure tentative code layout for cold BBs runs. (PR #96609)
Paschalis Mpeis via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 2 03:12:48 PDT 2024
https://github.com/paschalis-mpeis updated https://github.com/llvm/llvm-project/pull/96609
>From 0df1465d9a00d1e6c1b640eea340e07b06b7081a Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Sun, 23 Jun 2024 19:36:32 +0100
Subject: [PATCH 1/5] BOLT may never run the tentative code layout for cold
BBs.
However, it might later require the addresses of cold BBs to compute
PC-relative jumps between hot and cold fragments of a function. Since
the addresses were never estimated, it uses 0x0 for cold BBs, resulting
in incorrect estimated addresses for any instructions within them.
As a result, LongJump often expands a branch that would otherwise fit in
a single instruction into a short jump.
For example, `relaxStub` might expand from:
```armasm
b .Ltmp1234
```
to:
```armasm
adrp x16, .Ltmp1234
add x16, x16, :lo12:.Ltmp1234
br x16
```
While this expansion is not wrong, it is unnecessary. Moreover, in some
large binaries this expansion has lead to runtime crashes. This should
be investigated further, as such expansions should not cause crashes.
---
bolt/lib/Passes/LongJmp.cpp | 1 +
bolt/test/AArch64/split-funcs-lite.s | 27 +++++++++++++++++++++++++
bolt/test/AArch64/split-funcs-lite.test | 14 +++++++++++++
3 files changed, 42 insertions(+)
create mode 100644 bolt/test/AArch64/split-funcs-lite.s
create mode 100644 bolt/test/AArch64/split-funcs-lite.test
diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp
index c483f70a836ee1..958f99ebe29428 100644
--- a/bolt/lib/Passes/LongJmp.cpp
+++ b/bolt/lib/Passes/LongJmp.cpp
@@ -382,6 +382,7 @@ uint64_t LongJmpPass::tentativeLayoutRelocMode(
DotAddress += Func->estimateConstantIslandSize();
++CurrentIndex;
}
+
// BBs
for (BinaryFunction *Func : SortedFunctions)
tentativeBBLayout(*Func);
diff --git a/bolt/test/AArch64/split-funcs-lite.s b/bolt/test/AArch64/split-funcs-lite.s
new file mode 100644
index 00000000000000..13caeb3fe0d84a
--- /dev/null
+++ b/bolt/test/AArch64/split-funcs-lite.s
@@ -0,0 +1,27 @@
+// This test checks that tentative code layout for cold blocks always runs.
+// It commonly happens when using lite mode with split functions.
+
+// REQUIRES: system-linux
+
+// RUN: %clang %cflags -o %t %s
+// RUN: %clang %s %cflags -Wl,-q -o %t
+// RUN: link_fdata --no-lbr %s %t %t.fdata
+// RUN: llvm-bolt %t -o %t.bolt --data %t.fdata -split-functions \
+// RUN: -debug 2>&1 | FileCheck %s
+
+ .text
+ .globl foo
+ .type foo, %function
+foo:
+.entry_bb:
+# FDATA: 1 foo #.entry_bb# 10
+ cmp x0, #0
+ b.eq .Lcold_bb1
+ ret
+.Lcold_bb1:
+ ret
+
+## Force relocations
+.reloc 0, R_AARCH64_NONE
+
+// CHECK: foo{{.*}} cold tentative: {{.*}}
diff --git a/bolt/test/AArch64/split-funcs-lite.test b/bolt/test/AArch64/split-funcs-lite.test
new file mode 100644
index 00000000000000..a254e88773480b
--- /dev/null
+++ b/bolt/test/AArch64/split-funcs-lite.test
@@ -0,0 +1,14 @@
+// This test checks that tentative code layout for cold basic blocks runs
+// at least once, even when each function after the hot/cold frontier is not
+// emittable. This is done by ignoring each function using the 'skip-funcs' flag.
+// In a realistic scenario, this may happen when lite mode is enabled along
+// with a bolt profile.
+
+REQUIRES: system-linux
+
+RUN: %clang %cflags %p/../Inputs/asm_main.c -Wl,-q -o %t
+
+RUN: not --crash llvm-bolt %t -o %t.bolt -lite=1 -split-functions -split-all-cold \
+RUN: --skip-funcs="main,foo" 2>&1 | FileCheck %s
+
+CHECK: Did not perform tentative code layout for cold blocks.
>From 683beaa612495ef17900b2ac4f43f743415b760a Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Mon, 24 Jun 2024 17:43:06 +0100
Subject: [PATCH 2/5] [AArch64][BOLT] Ensure tentative code layout for cold BBs
runs.
When split functions is used, BOLT may skip tentative code layout
estimation in some cases, like:
- when there is no profile data for some blocks (ie cold blocks)
- when there are cold functions in lite mode
- when skip functions is used
However, when rewriting the binary we still need to compute PC-relative
distances between hot and cold basic blocks. Without cold layout
estimation, BOLT uses '0x0' as the address of the first cold block,
leading to incorrect estimations of any PC-relative addresses.
This affects large binaries as the relaxStub method expands more
branches than necessary using the short-jump sequence, at it wrongly
believes it has exceeded the branch distance boundary.
This increases code size with a larger and slower sequence; however,
performance regression is expected to be minimal since this only affects
called cold code.
Example of such an unnecessary relaxation:
from:
```armasm
b .Ltmp1234
```
to:
```armasm
adrp x16, .Ltmp1234
add x16, x16, :lo12:.Ltmp1234
br x16
```
---
bolt/lib/Passes/LongJmp.cpp | 9 ++++++---
bolt/test/AArch64/split-funcs-lite.test | 6 +++---
2 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp
index 958f99ebe29428..b30137b9b2731a 100644
--- a/bolt/lib/Passes/LongJmp.cpp
+++ b/bolt/lib/Passes/LongJmp.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "bolt/Passes/LongJmp.h"
+#include "bolt/Utils/CommandLineOpts.h"
#define DEBUG_TYPE "longjmp"
@@ -324,7 +325,6 @@ uint64_t LongJmpPass::tentativeLayoutRelocColdPart(
uint64_t LongJmpPass::tentativeLayoutRelocMode(
const BinaryContext &BC, std::vector<BinaryFunction *> &SortedFunctions,
uint64_t DotAddress) {
-
// Compute hot cold frontier
uint32_t LastHotIndex = -1u;
uint32_t CurrentIndex = 0;
@@ -354,9 +354,12 @@ uint64_t LongJmpPass::tentativeLayoutRelocMode(
for (BinaryFunction *Func : SortedFunctions) {
if (!BC.shouldEmit(*Func)) {
HotAddresses[Func] = Func->getAddress();
- continue;
+ // Don't perform any tentative address estimation of a function's cold
+ // layout if it won't be emitted, unless we are ignoring a large number of
+ // functions (ie, on lite mode) and we haven't done such estimation yet.
+ if (opts::processAllFunctions() || ColdLayoutDone)
+ continue;
}
-
if (!ColdLayoutDone && CurrentIndex >= LastHotIndex) {
DotAddress =
tentativeLayoutRelocColdPart(BC, SortedFunctions, DotAddress);
diff --git a/bolt/test/AArch64/split-funcs-lite.test b/bolt/test/AArch64/split-funcs-lite.test
index a254e88773480b..a0dbacee29d303 100644
--- a/bolt/test/AArch64/split-funcs-lite.test
+++ b/bolt/test/AArch64/split-funcs-lite.test
@@ -8,7 +8,7 @@ REQUIRES: system-linux
RUN: %clang %cflags %p/../Inputs/asm_main.c -Wl,-q -o %t
-RUN: not --crash llvm-bolt %t -o %t.bolt -lite=1 -split-functions -split-all-cold \
-RUN: --skip-funcs="main,foo" 2>&1 | FileCheck %s
+RUN: llvm-bolt %t -o %t.bolt -lite=1 -split-functions -split-all-cold \
+RUN: --skip-funcs="_init,_start,call_weak_fn/1,deregister_tm_clones/1,register_tm_clones/1,__do_global_dtors_aux/1,frame_dummy/1,main,foo,_fini" 2>&1 | FileCheck %s
-CHECK: Did not perform tentative code layout for cold blocks.
+CHECK-NOT: Did not perform tentative code layout for cold blocks.
>From 8cc07ed1d76dd2609ea62af3a2ab76b8ca234774 Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Sat, 28 Sep 2024 19:20:42 +0100
Subject: [PATCH 3/5] Addressing reviewers.
---
bolt/lib/Passes/LongJmp.cpp | 31 ++++++++++++++-----------
bolt/test/AArch64/split-funcs-lite.test | 2 +-
2 files changed, 18 insertions(+), 15 deletions(-)
diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp
index b30137b9b2731a..75e049825947ef 100644
--- a/bolt/lib/Passes/LongJmp.cpp
+++ b/bolt/lib/Passes/LongJmp.cpp
@@ -11,7 +11,6 @@
//===----------------------------------------------------------------------===//
#include "bolt/Passes/LongJmp.h"
-#include "bolt/Utils/CommandLineOpts.h"
#define DEBUG_TYPE "longjmp"
@@ -326,7 +325,7 @@ uint64_t LongJmpPass::tentativeLayoutRelocMode(
const BinaryContext &BC, std::vector<BinaryFunction *> &SortedFunctions,
uint64_t DotAddress) {
// Compute hot cold frontier
- uint32_t LastHotIndex = -1u;
+ int64_t LastHotIndex = -1u;
uint32_t CurrentIndex = 0;
if (opts::HotFunctionsAtEnd) {
for (BinaryFunction *BF : SortedFunctions) {
@@ -351,23 +350,21 @@ uint64_t LongJmpPass::tentativeLayoutRelocMode(
// Hot
CurrentIndex = 0;
bool ColdLayoutDone = false;
+ auto runColdLayout = [&]() {
+ DotAddress = tentativeLayoutRelocColdPart(BC, SortedFunctions, DotAddress);
+ ColdLayoutDone = true;
+ if (opts::HotFunctionsAtEnd)
+ DotAddress = alignTo(DotAddress, opts::AlignText);
+ };
for (BinaryFunction *Func : SortedFunctions) {
if (!BC.shouldEmit(*Func)) {
HotAddresses[Func] = Func->getAddress();
- // Don't perform any tentative address estimation of a function's cold
- // layout if it won't be emitted, unless we are ignoring a large number of
- // functions (ie, on lite mode) and we haven't done such estimation yet.
- if (opts::processAllFunctions() || ColdLayoutDone)
- continue;
- }
- if (!ColdLayoutDone && CurrentIndex >= LastHotIndex) {
- DotAddress =
- tentativeLayoutRelocColdPart(BC, SortedFunctions, DotAddress);
- ColdLayoutDone = true;
- if (opts::HotFunctionsAtEnd)
- DotAddress = alignTo(DotAddress, opts::AlignText);
+ continue;
}
+ if (!ColdLayoutDone && CurrentIndex >= LastHotIndex)
+ runColdLayout();
+
DotAddress = alignTo(DotAddress, Func->getMinAlignment());
uint64_t Pad =
offsetToAlignment(DotAddress, llvm::Align(Func->getAlignment()));
@@ -386,6 +383,12 @@ uint64_t LongJmpPass::tentativeLayoutRelocMode(
++CurrentIndex;
}
+ // Ensure that tentative code layout always runs for cold blocks.
+ if (!ColdLayoutDone)
+ runColdLayout();
+ assert(ColdLayoutDone &&
+ "Did not perform tentative code layout for cold blocks.");
+
// BBs
for (BinaryFunction *Func : SortedFunctions)
tentativeBBLayout(*Func);
diff --git a/bolt/test/AArch64/split-funcs-lite.test b/bolt/test/AArch64/split-funcs-lite.test
index a0dbacee29d303..835fd248ce9ebe 100644
--- a/bolt/test/AArch64/split-funcs-lite.test
+++ b/bolt/test/AArch64/split-funcs-lite.test
@@ -9,6 +9,6 @@ REQUIRES: system-linux
RUN: %clang %cflags %p/../Inputs/asm_main.c -Wl,-q -o %t
RUN: llvm-bolt %t -o %t.bolt -lite=1 -split-functions -split-all-cold \
-RUN: --skip-funcs="_init,_start,call_weak_fn/1,deregister_tm_clones/1,register_tm_clones/1,__do_global_dtors_aux/1,frame_dummy/1,main,foo,_fini" 2>&1 | FileCheck %s
+RUN: --skip-funcs="main,foo" 2>&1 | FileCheck %s
CHECK-NOT: Did not perform tentative code layout for cold blocks.
>From 104f14bb309e38af9931f6ebc34cd764e457a997 Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Wed, 2 Oct 2024 10:13:24 +0100
Subject: [PATCH 4/5] Addressing reviewers - 02
---
bolt/lib/Passes/LongJmp.cpp | 2 --
bolt/test/AArch64/split-funcs-lite.test | 14 --------------
2 files changed, 16 deletions(-)
delete mode 100644 bolt/test/AArch64/split-funcs-lite.test
diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp
index 75e049825947ef..0b2d00300f46b9 100644
--- a/bolt/lib/Passes/LongJmp.cpp
+++ b/bolt/lib/Passes/LongJmp.cpp
@@ -386,8 +386,6 @@ uint64_t LongJmpPass::tentativeLayoutRelocMode(
// Ensure that tentative code layout always runs for cold blocks.
if (!ColdLayoutDone)
runColdLayout();
- assert(ColdLayoutDone &&
- "Did not perform tentative code layout for cold blocks.");
// BBs
for (BinaryFunction *Func : SortedFunctions)
diff --git a/bolt/test/AArch64/split-funcs-lite.test b/bolt/test/AArch64/split-funcs-lite.test
deleted file mode 100644
index 835fd248ce9ebe..00000000000000
--- a/bolt/test/AArch64/split-funcs-lite.test
+++ /dev/null
@@ -1,14 +0,0 @@
-// This test checks that tentative code layout for cold basic blocks runs
-// at least once, even when each function after the hot/cold frontier is not
-// emittable. This is done by ignoring each function using the 'skip-funcs' flag.
-// In a realistic scenario, this may happen when lite mode is enabled along
-// with a bolt profile.
-
-REQUIRES: system-linux
-
-RUN: %clang %cflags %p/../Inputs/asm_main.c -Wl,-q -o %t
-
-RUN: llvm-bolt %t -o %t.bolt -lite=1 -split-functions -split-all-cold \
-RUN: --skip-funcs="main,foo" 2>&1 | FileCheck %s
-
-CHECK-NOT: Did not perform tentative code layout for cold blocks.
>From 59023c90996b55e95af526f5706ff0056bec5b2b Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <Paschalis.Mpeis at arm.com>
Date: Wed, 2 Oct 2024 11:10:43 +0100
Subject: [PATCH 5/5] Addressing reviewers (3): fix indentation
---
bolt/test/AArch64/split-funcs-lite.s | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/bolt/test/AArch64/split-funcs-lite.s b/bolt/test/AArch64/split-funcs-lite.s
index 13caeb3fe0d84a..ce6ba47dca2fc8 100644
--- a/bolt/test/AArch64/split-funcs-lite.s
+++ b/bolt/test/AArch64/split-funcs-lite.s
@@ -6,8 +6,8 @@
// RUN: %clang %cflags -o %t %s
// RUN: %clang %s %cflags -Wl,-q -o %t
// RUN: link_fdata --no-lbr %s %t %t.fdata
-// RUN: llvm-bolt %t -o %t.bolt --data %t.fdata -split-functions \
-// RUN: -debug 2>&1 | FileCheck %s
+// RUN: llvm-bolt %t -o %t.bolt --data %t.fdata -split-functions \
+// RUN: -debug 2>&1 | FileCheck %s
.text
.globl foo
More information about the llvm-commits
mailing list