[llvm] [BOLT][AArch64] Enabling Memcpy Inlining for AArch64 in BOLT (PR #154929)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 5 09:19:52 PDT 2025
https://github.com/yafet-a updated https://github.com/llvm/llvm-project/pull/154929
>From ce56f84aa7c86e1b35cf0ca4218a1f23702a206e Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 21 Aug 2025 10:12:03 -0700
Subject: [PATCH 01/26] pre-commit test
---
bolt/test/AArch64/inline-memcpy.s | 193 ++++++++++++++++++++++++++++++
1 file changed, 193 insertions(+)
create mode 100644 bolt/test/AArch64/inline-memcpy.s
diff --git a/bolt/test/AArch64/inline-memcpy.s b/bolt/test/AArch64/inline-memcpy.s
new file mode 100644
index 0000000000000..3bb498e600fb6
--- /dev/null
+++ b/bolt/test/AArch64/inline-memcpy.s
@@ -0,0 +1,193 @@
+## This test checks that BOLT correctly inlines memcpy calls on AArch64.
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang --target=aarch64-unknown-linux-gnu %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
+# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
+
+# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed)
+# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
+
+# Each function should use optimal size-specific instructions and NO memcpy calls
+
+# 1-byte copy should use single byte load/store (ldrb/strb)
+# CHECK-ASM-LABEL: <test_1_byte_direct>:
+# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 2-byte copy should use single 16-bit load/store (ldrh/strh)
+# CHECK-ASM-LABEL: <test_2_byte_direct>:
+# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 4-byte copy should use single 32-bit load/store (w register)
+# CHECK-ASM-LABEL: <test_4_byte_direct>:
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 8-byte copy should use single 64-bit load/store (x register)
+# CHECK-ASM-LABEL: <test_8_byte_direct>:
+# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 16-byte copy should use single 128-bit SIMD load/store (q register)
+# CHECK-ASM-LABEL: <test_16_byte_direct>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 32-byte copy should use two 128-bit SIMD operations
+# CHECK-ASM-LABEL: <test_32_byte_direct>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
+# CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20]
+# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24]
+# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
+# CHECK-ASM-LABEL: <test_128_byte_too_large>:
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+# CHECK-ASM-NOT: ldr{{.*}}q{{[0-9]+}}
+
+ .text
+ .globl test_1_byte_direct
+ .type test_1_byte_direct, at function
+test_1_byte_direct:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #1
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_1_byte_direct, .-test_1_byte_direct
+
+ .globl test_2_byte_direct
+ .type test_2_byte_direct, at function
+test_2_byte_direct:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #2
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_2_byte_direct, .-test_2_byte_direct
+
+ .globl test_4_byte_direct
+ .type test_4_byte_direct, at function
+test_4_byte_direct:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #4
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_4_byte_direct, .-test_4_byte_direct
+
+ .globl test_8_byte_direct
+ .type test_8_byte_direct, at function
+test_8_byte_direct:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #8
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_8_byte_direct, .-test_8_byte_direct
+
+ .globl test_16_byte_direct
+ .type test_16_byte_direct, at function
+test_16_byte_direct:
+ stp x29, x30, [sp, #-48]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #32
+ mov x2, #16
+ bl memcpy
+ ldp x29, x30, [sp], #48
+ ret
+ .size test_16_byte_direct, .-test_16_byte_direct
+
+ .globl test_32_byte_direct
+ .type test_32_byte_direct, at function
+test_32_byte_direct:
+ stp x29, x30, [sp, #-80]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #48
+ mov x2, #32
+ bl memcpy
+ ldp x29, x30, [sp], #80
+ ret
+ .size test_32_byte_direct, .-test_32_byte_direct
+
+ .globl test_37_byte_arbitrary
+ .type test_37_byte_arbitrary, at function
+test_37_byte_arbitrary:
+ stp x29, x30, [sp, #-96]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #56
+ mov x2, #37
+ bl memcpy
+ ldp x29, x30, [sp], #96
+ ret
+ .size test_37_byte_arbitrary, .-test_37_byte_arbitrary
+
+ .globl test_128_byte_too_large
+ .type test_128_byte_too_large, at function
+test_128_byte_too_large:
+ stp x29, x30, [sp, #-288]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #152
+ mov x2, #128
+ bl memcpy
+ ldp x29, x30, [sp], #288
+ ret
+ .size test_128_byte_too_large, .-test_128_byte_too_large
+
+ .globl main
+ .type main, at function
+main:
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ bl test_1_byte_direct
+ bl test_2_byte_direct
+ bl test_4_byte_direct
+ bl test_8_byte_direct
+ bl test_16_byte_direct
+ bl test_32_byte_direct
+ bl test_37_byte_arbitrary
+ bl test_128_byte_too_large
+
+ mov w0, #0
+ ldp x29, x30, [sp], #16
+ ret
+ .size main, .-main
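
For reference, the greedy decomposition checked by the 37-byte case above ((2*16) + (1*4) + (1*1)) can be reproduced with a small standalone C++ sketch; the chunk table simply mirrors the q/x/w/h/b access widths, and the printed byte offsets line up with the #0x10/#0x20/#0x24 operands in the CHECK-ASM lines (illustrative only, not part of the patch):

#include <cstdint>
#include <cstdio>

int main() {
  // Access widths in descending order, mirroring the q/x/w/h/b registers.
  const struct { uint64_t Width; const char *Kind; } Chunks[] = {
      {16, "q"}, {8, "x"}, {4, "w"}, {2, "h"}, {1, "b"}};
  uint64_t Remaining = 37, Offset = 0;
  for (const auto &C : Chunks)
    while (Remaining >= C.Width) {
      // For 37 bytes this prints q at 0x0, q at 0x10, w at 0x20, b at 0x24.
      std::printf("%s-width copy at byte offset 0x%llx\n", C.Kind,
                  (unsigned long long)Offset);
      Remaining -= C.Width;
      Offset += C.Width;
    }
  return 0;
}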
>From 1c27d8967a1938cea4e9bf3110362cb91d7b3bbb Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 21 Aug 2025 10:17:40 -0700
Subject: [PATCH 02/26] [BOLT] documentation
---
bolt/docs/CommandLineArgumentReference.md | 2 +-
bolt/lib/Rewrite/BinaryPassManager.cpp | 4 +++-
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md
index f3881c9a640a9..3fc0594514f6e 100644
--- a/bolt/docs/CommandLineArgumentReference.md
+++ b/bolt/docs/CommandLineArgumentReference.md
@@ -631,7 +631,7 @@
- `--inline-memcpy`
- Inline memcpy using 'rep movsb' instruction (X86-only)
+ Inline memcpy using optimized instruction sequences (X86: 'rep movsb', AArch64: width-optimized register operations)
- `--inline-small-functions`
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index 996d2e972599d..6b554598cf1bc 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -247,7 +247,9 @@ static cl::opt<bool> Stoke("stoke", cl::desc("turn on the stoke analysis"),
static cl::opt<bool> StringOps(
"inline-memcpy",
- cl::desc("inline memcpy using 'rep movsb' instruction (X86-only)"),
+ cl::desc(
+ "inline memcpy using size-specific optimized instructions "
+ "(X86: 'rep movsb', AArch64: width-optimized register operations)"),
cl::cat(BoltOptCategory));
static cl::opt<bool> StripRepRet(
>From db353b759b298aed2e0ebf86f99d6049a5a62e12 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 21 Aug 2025 11:25:05 -0700
Subject: [PATCH 03/26] [BOLT][AArch64] Implement safe size-aware memcpy
inlining
---
bolt/include/bolt/Core/MCPlusBuilder.h | 16 ++
bolt/lib/Passes/BinaryPasses.cpp | 28 ++-
.../Target/AArch64/AArch64MCPlusBuilder.cpp | 204 ++++++++++++++++++
3 files changed, 246 insertions(+), 2 deletions(-)
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index e773250ce8734..6cbf288f3b8f4 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -1895,6 +1895,22 @@ class MCPlusBuilder {
return {};
}
+ /// Creates size-aware inline memcpy instruction. If \p KnownSize is provided,
+ /// generates optimized code for that specific size. Falls back to regular
+ /// createInlineMemcpy if size is unknown or not needed (e.g. with X86).
+ virtual InstructionListType
+ createInlineMemcpy(bool ReturnEnd, std::optional<uint64_t> KnownSize) const {
+ return createInlineMemcpy(ReturnEnd);
+ }
+
+ /// Extract immediate value from move instruction that sets the given
+ /// register. Returns the immediate value if the instruction is a
+ /// move-immediate to TargetReg.
+ virtual std::optional<uint64_t>
+ extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const {
+ return std::nullopt;
+ }
+
/// Create a target-specific relocation out of the \p Fixup.
/// Note that not every fixup could be converted into a relocation.
virtual std::optional<Relocation>
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index d7f02b9470030..0068c1ad0bf1c 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1843,7 +1843,7 @@ Error StripRepRet::runOnFunctions(BinaryContext &BC) {
}
Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
- if (!BC.isX86())
+ if (!BC.isX86() && !BC.isAArch64())
return Error::success();
uint64_t NumInlined = 0;
@@ -1866,8 +1866,32 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
const bool IsTailCall = BC.MIB->isTailCall(Inst);
+ // Extract the size of the copy from preceding instructions by looking
+ // for writes to the size register
+ std::optional<uint64_t> KnownSize = std::nullopt;
+ BitVector WrittenRegs(BC.MRI->getNumRegs());
+
+ // Get the size register (3rd arg register, index 2 for AArch64)
+ MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
+
+ // Look backwards through the basic block for size-setting instr
+ for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
+ MCInst &Inst = *InstIt;
+ WrittenRegs.reset(); // Clear and check what the instruction writes to
+ BC.MIB->getWrittenRegs(Inst, WrittenRegs);
+
+ // Check for writes to the size register
+ if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) {
+ if (std::optional<uint64_t> ExtractedSize =
+ BC.MIB->extractMoveImmediate(Inst, SizeReg)) {
+ KnownSize = *ExtractedSize;
+ break;
+ }
+ }
+ }
+
const InstructionListType NewCode =
- BC.MIB->createInlineMemcpy(IsMemcpy8);
+ BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);
II = BB.replaceInstruction(II, NewCode);
std::advance(II, NewCode.size() - 1);
if (IsTailCall) {
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 973261765f951..03f62117ea096 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2597,6 +2597,210 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
getInstructionSize(const MCInst &Inst) const override {
return 4;
}
+
+ InstructionListType createInlineMemcpy(bool ReturnEnd) const override {
+ // Fallback
+ return createInlineMemcpy(ReturnEnd, std::nullopt);
+ }
+
+ std::optional<uint64_t>
+ extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
+ if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3) {
+ if (Inst.getOperand(0).isReg() &&
+ Inst.getOperand(0).getReg() == TargetReg &&
+ Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() &&
+ Inst.getOperand(2).getImm() == 0) {
+ return Inst.getOperand(1).getImm();
+ }
+ }
+ return std::nullopt;
+ }
+
+ InstructionListType
+ createInlineMemcpy(bool ReturnEnd,
+ std::optional<uint64_t> KnownSize) const override {
+ InstructionListType Code;
+ if (ReturnEnd) {
+ if (KnownSize.has_value() && (*KnownSize >> 12) == 0) {
+ // Use immediate if size is known and fits in 12-bit immediate (0-4095)
+ Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
+ .addReg(AArch64::X0)
+ .addReg(AArch64::X0)
+ .addImm(*KnownSize)
+ .addImm(0));
+ } else {
+ // Fall back to register add for unknown or large sizes
+ Code.emplace_back(MCInstBuilder(AArch64::ADDXrr)
+ .addReg(AArch64::X0)
+ .addReg(AArch64::X0)
+ .addReg(AArch64::X2));
+ }
+ }
+
+ if (!KnownSize.has_value()) {
+ return Code;
+ }
+
+ uint64_t Size = *KnownSize;
+ return generateSizeSpecificMemcpy(Code, Size);
+ }
+
+ InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
+ uint64_t Size) const {
+ // Generate optimal instruction sequences based on exact size
+ switch (Size) {
+ case 1:
+ // Single byte copy
+ Code.emplace_back(MCInstBuilder(AArch64::LDRBBui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X1)
+ .addImm(0));
+ Code.emplace_back(MCInstBuilder(AArch64::STRBBui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X0)
+ .addImm(0));
+ break;
+
+ case 2:
+ // 2-byte copy using 16-bit load/store
+ Code.emplace_back(MCInstBuilder(AArch64::LDRHHui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X1)
+ .addImm(0));
+ Code.emplace_back(MCInstBuilder(AArch64::STRHHui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X0)
+ .addImm(0));
+ break;
+
+ case 4:
+ // 4-byte copy using 32-bit load/store
+ Code.emplace_back(MCInstBuilder(AArch64::LDRWui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X1)
+ .addImm(0));
+ Code.emplace_back(MCInstBuilder(AArch64::STRWui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X0)
+ .addImm(0));
+ break;
+
+ case 8:
+ // 8-byte copy using 64-bit load/store
+ Code.emplace_back(MCInstBuilder(AArch64::LDRXui)
+ .addReg(AArch64::X3)
+ .addReg(AArch64::X1)
+ .addImm(0));
+ Code.emplace_back(MCInstBuilder(AArch64::STRXui)
+ .addReg(AArch64::X3)
+ .addReg(AArch64::X0)
+ .addImm(0));
+ break;
+
+ case 16:
+ // 16-byte copy using 128-bit SIMD
+ Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
+ .addReg(AArch64::Q0)
+ .addReg(AArch64::X1)
+ .addImm(0));
+ Code.emplace_back(MCInstBuilder(AArch64::STRQui)
+ .addReg(AArch64::Q0)
+ .addReg(AArch64::X0)
+ .addImm(0));
+ break;
+
+ case 32:
+ // 32-byte copy using two 128-bit SIMD operations
+ Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
+ .addReg(AArch64::Q0)
+ .addReg(AArch64::X1)
+ .addImm(0));
+ Code.emplace_back(MCInstBuilder(AArch64::STRQui)
+ .addReg(AArch64::Q0)
+ .addReg(AArch64::X0)
+ .addImm(0));
+ Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
+ .addReg(AArch64::Q1)
+ .addReg(AArch64::X1)
+ .addImm(1));
+ Code.emplace_back(MCInstBuilder(AArch64::STRQui)
+ .addReg(AArch64::Q1)
+ .addReg(AArch64::X0)
+ .addImm(1));
+ break;
+
+ default:
+ if (Size <= 64) {
+ // For sizes up to 64 bytes, greedily use the largest possible loads in
+ // descending order
+ uint64_t Remaining = Size;
+ uint64_t Offset = 0;
+
+ while (Remaining >= 16) {
+ Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
+ .addReg(AArch64::Q0)
+ .addReg(AArch64::X1)
+ .addImm(Offset / 16));
+ Code.emplace_back(MCInstBuilder(AArch64::STRQui)
+ .addReg(AArch64::Q0)
+ .addReg(AArch64::X0)
+ .addImm(Offset / 16));
+ Remaining -= 16;
+ Offset += 16;
+ }
+ if (Remaining >= 8) {
+ Code.emplace_back(MCInstBuilder(AArch64::LDRXui)
+ .addReg(AArch64::X3)
+ .addReg(AArch64::X1)
+ .addImm(Offset / 8));
+ Code.emplace_back(MCInstBuilder(AArch64::STRXui)
+ .addReg(AArch64::X3)
+ .addReg(AArch64::X0)
+ .addImm(Offset / 8));
+ Remaining -= 8;
+ Offset += 8;
+ }
+ if (Remaining >= 4) {
+ Code.emplace_back(MCInstBuilder(AArch64::LDRWui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X1)
+ .addImm(Offset / 4));
+ Code.emplace_back(MCInstBuilder(AArch64::STRWui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X0)
+ .addImm(Offset / 4));
+ Remaining -= 4;
+ Offset += 4;
+ }
+ if (Remaining >= 2) {
+ Code.emplace_back(MCInstBuilder(AArch64::LDRHHui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X1)
+ .addImm(Offset / 2));
+ Code.emplace_back(MCInstBuilder(AArch64::STRHHui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X0)
+ .addImm(Offset / 2));
+ Remaining -= 2;
+ Offset += 2;
+ }
+ if (Remaining == 1) {
+ Code.emplace_back(MCInstBuilder(AArch64::LDRBBui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X1)
+ .addImm(Offset));
+ Code.emplace_back(MCInstBuilder(AArch64::STRBBui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X0)
+ .addImm(Offset));
+ }
+ } else {
+ Code.clear();
+ }
+ break;
+ }
+ return Code;
+ }
};
} // end anonymous namespace
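
A note on the pattern the new hooks recognise: `mov x2, #<bytes>` assembles to MOVZXi, whose operand 0 is the destination register, operand 1 the 16-bit immediate, and operand 2 the shift amount, which is exactly what extractMoveImmediate() inspects. A minimal fragment, assuming the surrounding AArch64MCPlusBuilder/LLVM MC context (illustrative only, not part of the patch):

// `mov x2, #37` as the pass sees it: MOVZXi <reg>, <imm16>, <shift>.
MCInst SizeMov = MCInstBuilder(AArch64::MOVZXi)
                     .addReg(AArch64::X2) // getIntArgRegister(2) on AArch64
                     .addImm(37)          // byte count passed to memcpy
                     .addImm(0);          // zero shift, i.e. a plain mov
// extractMoveImmediate(SizeMov, AArch64::X2) would return 37, and
// createInlineMemcpy(/*ReturnEnd=*/false, 37) would emit the load/store pairs.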
>From 2e5b22b501a83796ff10ae30520e07cb44b21332 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 22 Aug 2025 05:14:11 -0700
Subject: [PATCH 04/26] test target fix for CI cross-compilation issue
---
bolt/test/AArch64/inline-memcpy.s | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/bolt/test/AArch64/inline-memcpy.s b/bolt/test/AArch64/inline-memcpy.s
index 3bb498e600fb6..e46308286e07b 100644
--- a/bolt/test/AArch64/inline-memcpy.s
+++ b/bolt/test/AArch64/inline-memcpy.s
@@ -1,6 +1,6 @@
## This test checks that BOLT correctly inlines memcpy calls on AArch64.
-# REQUIRES: system-linux
+# REQUIRES: system-linux, aarch64-registered-target
# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
# RUN: %clang --target=aarch64-unknown-linux-gnu %t.o -o %t.exe -Wl,-q
>From 385fa23691e05fbdb6ffb24cc6a9526ff8d08020 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 22 Aug 2025 05:49:37 -0700
Subject: [PATCH 05/26] moved inline-memcpy to avoid CI cross-compilation PIE
conflicts
---
bolt/test/runtime/AArch64/inline-memcpy.s | 193 ++++++++++++++++++++++
1 file changed, 193 insertions(+)
create mode 100644 bolt/test/runtime/AArch64/inline-memcpy.s
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
new file mode 100644
index 0000000000000..0e16b6a7e963f
--- /dev/null
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -0,0 +1,193 @@
+## This test checks that BOLT correctly inlines memcpy calls on AArch64.
+
+# REQUIRES: system-linux, aarch64-registered-target
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
+# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
+
+# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed)
+# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
+
+# Each function should use optimal size-specific instructions and NO memcpy calls
+
+# 1-byte copy should use single byte load/store (ldrb/strb)
+# CHECK-ASM-LABEL: <test_1_byte_direct>:
+# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 2-byte copy should use single 16-bit load/store (ldrh/strh)
+# CHECK-ASM-LABEL: <test_2_byte_direct>:
+# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 4-byte copy should use single 32-bit load/store (w register)
+# CHECK-ASM-LABEL: <test_4_byte_direct>:
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 8-byte copy should use single 64-bit load/store (x register)
+# CHECK-ASM-LABEL: <test_8_byte_direct>:
+# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 16-byte copy should use single 128-bit SIMD load/store (q register)
+# CHECK-ASM-LABEL: <test_16_byte_direct>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 32-byte copy should use two 128-bit SIMD operations
+# CHECK-ASM-LABEL: <test_32_byte_direct>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
+# CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20]
+# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24]
+# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
+# CHECK-ASM-LABEL: <test_128_byte_too_large>:
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+# CHECK-ASM-NOT: ldr{{.*}}q{{[0-9]+}}
+
+ .text
+ .globl test_1_byte_direct
+ .type test_1_byte_direct, at function
+test_1_byte_direct:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #1
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_1_byte_direct, .-test_1_byte_direct
+
+ .globl test_2_byte_direct
+ .type test_2_byte_direct, at function
+test_2_byte_direct:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #2
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_2_byte_direct, .-test_2_byte_direct
+
+ .globl test_4_byte_direct
+ .type test_4_byte_direct, at function
+test_4_byte_direct:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #4
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_4_byte_direct, .-test_4_byte_direct
+
+ .globl test_8_byte_direct
+ .type test_8_byte_direct, at function
+test_8_byte_direct:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #8
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_8_byte_direct, .-test_8_byte_direct
+
+ .globl test_16_byte_direct
+ .type test_16_byte_direct, at function
+test_16_byte_direct:
+ stp x29, x30, [sp, #-48]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #32
+ mov x2, #16
+ bl memcpy
+ ldp x29, x30, [sp], #48
+ ret
+ .size test_16_byte_direct, .-test_16_byte_direct
+
+ .globl test_32_byte_direct
+ .type test_32_byte_direct, at function
+test_32_byte_direct:
+ stp x29, x30, [sp, #-80]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #48
+ mov x2, #32
+ bl memcpy
+ ldp x29, x30, [sp], #80
+ ret
+ .size test_32_byte_direct, .-test_32_byte_direct
+
+ .globl test_37_byte_arbitrary
+ .type test_37_byte_arbitrary, at function
+test_37_byte_arbitrary:
+ stp x29, x30, [sp, #-96]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #56
+ mov x2, #37
+ bl memcpy
+ ldp x29, x30, [sp], #96
+ ret
+ .size test_37_byte_arbitrary, .-test_37_byte_arbitrary
+
+ .globl test_128_byte_too_large
+ .type test_128_byte_too_large, at function
+test_128_byte_too_large:
+ stp x29, x30, [sp, #-288]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #152
+ mov x2, #128
+ bl memcpy
+ ldp x29, x30, [sp], #288
+ ret
+ .size test_128_byte_too_large, .-test_128_byte_too_large
+
+ .globl main
+ .type main, at function
+main:
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ bl test_1_byte_direct
+ bl test_2_byte_direct
+ bl test_4_byte_direct
+ bl test_8_byte_direct
+ bl test_16_byte_direct
+ bl test_32_byte_direct
+ bl test_37_byte_arbitrary
+ bl test_128_byte_too_large
+
+ mov w0, #0
+ ldp x29, x30, [sp], #16
+ ret
+ .size main, .-main
>From 4f9ef678f0d07e23a362cf28805749d53bc8b0b5 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 22 Aug 2025 05:56:47 -0700
Subject: [PATCH 06/26] removed old test
---
bolt/test/AArch64/inline-memcpy.s | 193 ------------------------------
1 file changed, 193 deletions(-)
delete mode 100644 bolt/test/AArch64/inline-memcpy.s
diff --git a/bolt/test/AArch64/inline-memcpy.s b/bolt/test/AArch64/inline-memcpy.s
deleted file mode 100644
index e46308286e07b..0000000000000
--- a/bolt/test/AArch64/inline-memcpy.s
+++ /dev/null
@@ -1,193 +0,0 @@
-## This test checks that BOLT correctly inlines memcpy calls on AArch64.
-
-# REQUIRES: system-linux, aarch64-registered-target
-
-# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
-# RUN: %clang --target=aarch64-unknown-linux-gnu %t.o -o %t.exe -Wl,-q
-# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
-# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
-
-# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed)
-# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
-
-# Each function should use optimal size-specific instructions and NO memcpy calls
-
-# 1-byte copy should use single byte load/store (ldrb/strb)
-# CHECK-ASM-LABEL: <test_1_byte_direct>:
-# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 2-byte copy should use single 16-bit load/store (ldrh/strh)
-# CHECK-ASM-LABEL: <test_2_byte_direct>:
-# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 4-byte copy should use single 32-bit load/store (w register)
-# CHECK-ASM-LABEL: <test_4_byte_direct>:
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 8-byte copy should use single 64-bit load/store (x register)
-# CHECK-ASM-LABEL: <test_8_byte_direct>:
-# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 16-byte copy should use single 128-bit SIMD load/store (q register)
-# CHECK-ASM-LABEL: <test_16_byte_direct>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 32-byte copy should use two 128-bit SIMD operations
-# CHECK-ASM-LABEL: <test_32_byte_direct>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
-# CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20]
-# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24]
-# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
-# CHECK-ASM-LABEL: <test_128_byte_too_large>:
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-# CHECK-ASM-NOT: ldr{{.*}}q{{[0-9]+}}
-
- .text
- .globl test_1_byte_direct
- .type test_1_byte_direct, at function
-test_1_byte_direct:
- stp x29, x30, [sp, #-32]!
- mov x29, sp
- add x1, sp, #16
- add x0, sp, #8
- mov x2, #1
- bl memcpy
- ldp x29, x30, [sp], #32
- ret
- .size test_1_byte_direct, .-test_1_byte_direct
-
- .globl test_2_byte_direct
- .type test_2_byte_direct, at function
-test_2_byte_direct:
- stp x29, x30, [sp, #-32]!
- mov x29, sp
- add x1, sp, #16
- add x0, sp, #8
- mov x2, #2
- bl memcpy
- ldp x29, x30, [sp], #32
- ret
- .size test_2_byte_direct, .-test_2_byte_direct
-
- .globl test_4_byte_direct
- .type test_4_byte_direct, at function
-test_4_byte_direct:
- stp x29, x30, [sp, #-32]!
- mov x29, sp
- add x1, sp, #16
- add x0, sp, #8
- mov x2, #4
- bl memcpy
- ldp x29, x30, [sp], #32
- ret
- .size test_4_byte_direct, .-test_4_byte_direct
-
- .globl test_8_byte_direct
- .type test_8_byte_direct, at function
-test_8_byte_direct:
- stp x29, x30, [sp, #-32]!
- mov x29, sp
- add x1, sp, #16
- add x0, sp, #8
- mov x2, #8
- bl memcpy
- ldp x29, x30, [sp], #32
- ret
- .size test_8_byte_direct, .-test_8_byte_direct
-
- .globl test_16_byte_direct
- .type test_16_byte_direct, at function
-test_16_byte_direct:
- stp x29, x30, [sp, #-48]!
- mov x29, sp
- add x1, sp, #16
- add x0, sp, #32
- mov x2, #16
- bl memcpy
- ldp x29, x30, [sp], #48
- ret
- .size test_16_byte_direct, .-test_16_byte_direct
-
- .globl test_32_byte_direct
- .type test_32_byte_direct, at function
-test_32_byte_direct:
- stp x29, x30, [sp, #-80]!
- mov x29, sp
- add x1, sp, #16
- add x0, sp, #48
- mov x2, #32
- bl memcpy
- ldp x29, x30, [sp], #80
- ret
- .size test_32_byte_direct, .-test_32_byte_direct
-
- .globl test_37_byte_arbitrary
- .type test_37_byte_arbitrary, at function
-test_37_byte_arbitrary:
- stp x29, x30, [sp, #-96]!
- mov x29, sp
- add x1, sp, #16
- add x0, sp, #56
- mov x2, #37
- bl memcpy
- ldp x29, x30, [sp], #96
- ret
- .size test_37_byte_arbitrary, .-test_37_byte_arbitrary
-
- .globl test_128_byte_too_large
- .type test_128_byte_too_large, at function
-test_128_byte_too_large:
- stp x29, x30, [sp, #-288]!
- mov x29, sp
- add x1, sp, #16
- add x0, sp, #152
- mov x2, #128
- bl memcpy
- ldp x29, x30, [sp], #288
- ret
- .size test_128_byte_too_large, .-test_128_byte_too_large
-
- .globl main
- .type main, at function
-main:
- stp x29, x30, [sp, #-16]!
- mov x29, sp
-
- bl test_1_byte_direct
- bl test_2_byte_direct
- bl test_4_byte_direct
- bl test_8_byte_direct
- bl test_16_byte_direct
- bl test_32_byte_direct
- bl test_37_byte_arbitrary
- bl test_128_byte_too_large
-
- mov w0, #0
- ldp x29, x30, [sp], #16
- ret
- .size main, .-main
>From e83126edd3dd418086f8341a92609210ba7cb874 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 22 Aug 2025 08:51:18 -0700
Subject: [PATCH 07/26] response to review
---
bolt/lib/Passes/BinaryPasses.cpp | 37 +++--
.../Target/AArch64/AArch64MCPlusBuilder.cpp | 140 ++++--------------
2 files changed, 49 insertions(+), 128 deletions(-)
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 0068c1ad0bf1c..e532c2aa0422d 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1866,26 +1866,25 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
const bool IsTailCall = BC.MIB->isTailCall(Inst);
- // Extract the size of the copy from preceding instructions by looking
- // for writes to the size register
+ // Extract size from preceding instructions (AArch64 only)
+ // Pattern: MOV X2, #nb-bytes; BL memcpy src, dest, X2
std::optional<uint64_t> KnownSize = std::nullopt;
- BitVector WrittenRegs(BC.MRI->getNumRegs());
-
- // Get the size register (3rd arg register, index 2 for AArch64)
- MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
-
- // Look backwards through the basic block for size-setting instr
- for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
- MCInst &Inst = *InstIt;
- WrittenRegs.reset(); // Clear and check what the instruction writes to
- BC.MIB->getWrittenRegs(Inst, WrittenRegs);
-
- // Check for writes to the size register
- if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) {
- if (std::optional<uint64_t> ExtractedSize =
- BC.MIB->extractMoveImmediate(Inst, SizeReg)) {
- KnownSize = *ExtractedSize;
- break;
+ if (BC.isAArch64()) {
+ BitVector WrittenRegs(BC.MRI->getNumRegs());
+ MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
+
+ // Look backwards for size-setting instruction
+ for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
+ MCInst &Inst = *InstIt;
+ WrittenRegs.reset();
+ BC.MIB->getWrittenRegs(Inst, WrittenRegs);
+
+ if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) {
+ if (std::optional<uint64_t> ExtractedSize =
+ BC.MIB->extractMoveImmediate(Inst, SizeReg)) {
+ KnownSize = *ExtractedSize;
+ break;
+ }
}
}
}
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 03f62117ea096..e640044ec762d 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2647,152 +2647,74 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
uint64_t Size) const {
+ // Helper to add load/store pair
+ auto addLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
+ unsigned Reg, unsigned Offset = 0) {
+ Code.emplace_back(MCInstBuilder(LoadOpc)
+ .addReg(Reg)
+ .addReg(AArch64::X1)
+ .addImm(Offset));
+ Code.emplace_back(MCInstBuilder(StoreOpc)
+ .addReg(Reg)
+ .addReg(AArch64::X0)
+ .addImm(Offset));
+ };
+
// Generate optimal instruction sequences based on exact size
switch (Size) {
case 1:
- // Single byte copy
- Code.emplace_back(MCInstBuilder(AArch64::LDRBBui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X1)
- .addImm(0));
- Code.emplace_back(MCInstBuilder(AArch64::STRBBui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X0)
- .addImm(0));
+ addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
break;
-
case 2:
- // 2-byte copy using 16-bit load/store
- Code.emplace_back(MCInstBuilder(AArch64::LDRHHui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X1)
- .addImm(0));
- Code.emplace_back(MCInstBuilder(AArch64::STRHHui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X0)
- .addImm(0));
+ addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3);
break;
-
case 4:
- // 4-byte copy using 32-bit load/store
- Code.emplace_back(MCInstBuilder(AArch64::LDRWui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X1)
- .addImm(0));
- Code.emplace_back(MCInstBuilder(AArch64::STRWui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X0)
- .addImm(0));
+ addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3);
break;
-
case 8:
- // 8-byte copy using 64-bit load/store
- Code.emplace_back(MCInstBuilder(AArch64::LDRXui)
- .addReg(AArch64::X3)
- .addReg(AArch64::X1)
- .addImm(0));
- Code.emplace_back(MCInstBuilder(AArch64::STRXui)
- .addReg(AArch64::X3)
- .addReg(AArch64::X0)
- .addImm(0));
+ addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3);
break;
-
case 16:
- // 16-byte copy using 128-bit SIMD
- Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
- .addReg(AArch64::Q0)
- .addReg(AArch64::X1)
- .addImm(0));
- Code.emplace_back(MCInstBuilder(AArch64::STRQui)
- .addReg(AArch64::Q0)
- .addReg(AArch64::X0)
- .addImm(0));
+ addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0);
break;
-
case 32:
- // 32-byte copy using two 128-bit SIMD operations
- Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
- .addReg(AArch64::Q0)
- .addReg(AArch64::X1)
- .addImm(0));
- Code.emplace_back(MCInstBuilder(AArch64::STRQui)
- .addReg(AArch64::Q0)
- .addReg(AArch64::X0)
- .addImm(0));
- Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
- .addReg(AArch64::Q1)
- .addReg(AArch64::X1)
- .addImm(1));
- Code.emplace_back(MCInstBuilder(AArch64::STRQui)
- .addReg(AArch64::Q1)
- .addReg(AArch64::X0)
- .addImm(1));
+ addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0);
+ addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1);
break;
default:
if (Size <= 64) {
- // For sizes up to 64 bytes, greedily use the largest possible loads in
- // descending order
+ // For sizes up to 64 bytes, greedily use the largest possible loads
uint64_t Remaining = Size;
uint64_t Offset = 0;
while (Remaining >= 16) {
- Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
- .addReg(AArch64::Q0)
- .addReg(AArch64::X1)
- .addImm(Offset / 16));
- Code.emplace_back(MCInstBuilder(AArch64::STRQui)
- .addReg(AArch64::Q0)
- .addReg(AArch64::X0)
- .addImm(Offset / 16));
+ addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0,
+ Offset / 16);
Remaining -= 16;
Offset += 16;
}
if (Remaining >= 8) {
- Code.emplace_back(MCInstBuilder(AArch64::LDRXui)
- .addReg(AArch64::X3)
- .addReg(AArch64::X1)
- .addImm(Offset / 8));
- Code.emplace_back(MCInstBuilder(AArch64::STRXui)
- .addReg(AArch64::X3)
- .addReg(AArch64::X0)
- .addImm(Offset / 8));
+ addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3,
+ Offset / 8);
Remaining -= 8;
Offset += 8;
}
if (Remaining >= 4) {
- Code.emplace_back(MCInstBuilder(AArch64::LDRWui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X1)
- .addImm(Offset / 4));
- Code.emplace_back(MCInstBuilder(AArch64::STRWui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X0)
- .addImm(Offset / 4));
+ addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3,
+ Offset / 4);
Remaining -= 4;
Offset += 4;
}
if (Remaining >= 2) {
- Code.emplace_back(MCInstBuilder(AArch64::LDRHHui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X1)
- .addImm(Offset / 2));
- Code.emplace_back(MCInstBuilder(AArch64::STRHHui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X0)
- .addImm(Offset / 2));
+ addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3,
+ Offset / 2);
Remaining -= 2;
Offset += 2;
}
if (Remaining == 1) {
- Code.emplace_back(MCInstBuilder(AArch64::LDRBBui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X1)
- .addImm(Offset));
- Code.emplace_back(MCInstBuilder(AArch64::STRBBui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X0)
- .addImm(Offset));
+ addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3,
+ Offset);
}
} else {
Code.clear();
>From cf8279a8b5081eec657a1f835c54470653186787 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Wed, 27 Aug 2025 03:57:43 -0700
Subject: [PATCH 08/26] Update conditional formatting and move check for size
 into BinaryPasses
---
bolt/lib/Passes/BinaryPasses.cpp | 5 +++++
bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 13 ++++---------
2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index e532c2aa0422d..1aade44286052 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1889,6 +1889,11 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
}
}
+ if (BC.isAArch64() && !KnownSize.has_value()) {
+ ++II;
+ continue;
+ }
+
const InstructionListType NewCode =
BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);
II = BB.replaceInstruction(II, NewCode);
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index e640044ec762d..9d30fdface0c5 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2621,24 +2621,19 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
std::optional<uint64_t> KnownSize) const override {
InstructionListType Code;
if (ReturnEnd) {
- if (KnownSize.has_value() && (*KnownSize >> 12) == 0) {
- // Use immediate if size is known and fits in 12-bit immediate (0-4095)
+ // Use immediate if size fits in 12-bit immediate (0-4095)
+ // Otherwise, fall back to register add for large sizes
+ if ((*KnownSize >> 12) == 0)
Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
.addReg(AArch64::X0)
.addReg(AArch64::X0)
.addImm(*KnownSize)
.addImm(0));
- } else {
- // Fall back to register add for unknown or large sizes
+ else
Code.emplace_back(MCInstBuilder(AArch64::ADDXrr)
.addReg(AArch64::X0)
.addReg(AArch64::X0)
.addReg(AArch64::X2));
- }
- }
-
- if (!KnownSize.has_value()) {
- return Code;
}
uint64_t Size = *KnownSize;
>From c317eb0cbd62ac6f164cf44b75d40e082167ce3d Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Wed, 27 Aug 2025 04:55:17 -0700
Subject: [PATCH 09/26] Negative Tests (live-in, register move, non-mov
instruction)
---
bolt/test/runtime/AArch64/inline-memcpy.s | 61 ++++++++++++++++++++++-
1 file changed, 60 insertions(+), 1 deletion(-)
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 0e16b6a7e963f..417b444f6a4bb 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,7 +7,7 @@
# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
-# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed)
+# Verify BOLT reports that it inlined memcpy calls (8 successful inlines out of 11 total calls)
# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
# Each function should use optimal size-specific instructions and NO memcpy calls
@@ -67,6 +67,18 @@
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# CHECK-ASM-NOT: ldr{{.*}}q{{[0-9]+}}
+# ADD immediate with non-zero source should NOT be inlined (can't track mov+add chain)
+# CHECK-ASM-LABEL: <test_4_byte_add_immediate>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
+# Register move should NOT be inlined (size unknown at compile time)
+# CHECK-ASM-LABEL: <test_register_move_negative>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
+# Live-in parameter should NOT be inlined (size unknown at compile time)
+# CHECK-ASM-LABEL: <test_live_in_negative>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
.text
.globl test_1_byte_direct
.type test_1_byte_direct, at function
@@ -172,6 +184,50 @@ test_128_byte_too_large:
ret
.size test_128_byte_too_large, .-test_128_byte_too_large
+ .globl test_4_byte_add_immediate
+ .type test_4_byte_add_immediate, at function
+test_4_byte_add_immediate:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x3, #0
+ add x2, x3, #4
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_4_byte_add_immediate, .-test_4_byte_add_immediate
+
+ .globl test_register_move_negative
+ .type test_register_move_negative, at function
+test_register_move_negative:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x6, #4
+ mov x2, x6
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_register_move_negative, .-test_register_move_negative
+
+ .globl test_live_in_negative
+ .type test_live_in_negative, at function
+test_live_in_negative:
+ # x2 comes in as parameter, no instruction sets it (should NOT inline)
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ # x2 is live-in, no size-setting instruction
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_live_in_negative, .-test_live_in_negative
+
+
+
.globl main
.type main, at function
main:
@@ -186,6 +242,9 @@ main:
bl test_32_byte_direct
bl test_37_byte_arbitrary
bl test_128_byte_too_large
+ bl test_4_byte_add_immediate
+ bl test_register_move_negative
+ bl test_live_in_negative
mov w0, #0
ldp x29, x30, [sp], #16
>From df97d61befcc9ceaf3d82648a1b68b88cc3e0451 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Wed, 27 Aug 2025 06:51:08 -0700
Subject: [PATCH 10/26] Remove redundant _memcpy8 handling
---
.../Target/AArch64/AArch64MCPlusBuilder.cpp | 29 ++++++---------
bolt/test/runtime/AArch64/inline-memcpy.s | 37 ++++++++++++++++++-
2 files changed, 47 insertions(+), 19 deletions(-)
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 9d30fdface0c5..366d4183bca51 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2620,24 +2620,19 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
createInlineMemcpy(bool ReturnEnd,
std::optional<uint64_t> KnownSize) const override {
InstructionListType Code;
- if (ReturnEnd) {
- // Use immediate if size fits in 12-bit immediate (0-4095)
- // Otherwise, fall back to register add for large sizes
- if ((*KnownSize >> 12) == 0)
- Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
- .addReg(AArch64::X0)
- .addReg(AArch64::X0)
- .addImm(*KnownSize)
- .addImm(0));
- else
- Code.emplace_back(MCInstBuilder(AArch64::ADDXrr)
- .addReg(AArch64::X0)
- .addReg(AArch64::X0)
- .addReg(AArch64::X2));
- }
-
uint64_t Size = *KnownSize;
- return generateSizeSpecificMemcpy(Code, Size);
+
+ // Generate the optimized memcpy sequence
+ generateSizeSpecificMemcpy(Code, Size);
+
+ // If _memcpy8, adjust X0 to return dest+size instead of dest
+ if (ReturnEnd)
+ Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
+ .addReg(AArch64::X0)
+ .addReg(AArch64::X0)
+ .addImm(Size)
+ .addImm(0));
+ return Code;
}
InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 417b444f6a4bb..961e21f82851d 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,8 +7,8 @@
# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
-# Verify BOLT reports that it inlined memcpy calls (8 successful inlines out of 11 total calls)
-# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
+# Verify BOLT reports that it inlined memcpy calls (9 successful inlines out of 12 total calls)
+# CHECK-INLINE: BOLT-INFO: inlined 9 memcpy() calls
# Each function should use optimal size-specific instructions and NO memcpy calls
@@ -79,6 +79,13 @@
# CHECK-ASM-LABEL: <test_live_in_negative>:
# CHECK-ASM: bl{{.*}}<memcpy
+# _memcpy8 should be inlined with end-pointer return (dest+size)
+# CHECK-ASM-LABEL: <test_memcpy8_4_byte>:
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: add{{.*}}x0, x0, #0x4
+# CHECK-ASM-NOT: bl{{.*}}<_memcpy8
+
.text
.globl test_1_byte_direct
.type test_1_byte_direct, at function
@@ -226,7 +233,31 @@ test_live_in_negative:
ret
.size test_live_in_negative, .-test_live_in_negative
+ .globl test_memcpy8_4_byte
+ .type test_memcpy8_4_byte, at function
+test_memcpy8_4_byte:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #4
+ bl _memcpy8
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_memcpy8_4_byte, .-test_memcpy8_4_byte
+ # Simple _memcpy8 implementation that calls memcpy and returns dest+size
+ .globl _memcpy8
+ .type _memcpy8, at function
+_memcpy8:
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ mov x3, x0
+ bl memcpy
+ add x0, x3, x2
+ ldp x29, x30, [sp], #16
+ ret
+ .size _memcpy8, .-_memcpy8
.globl main
.type main, at function
@@ -245,6 +276,8 @@ main:
bl test_4_byte_add_immediate
bl test_register_move_negative
bl test_live_in_negative
+ bl test_memcpy8_4_byte
+ bl test_memcpy8_large_size
mov w0, #0
ldp x29, x30, [sp], #16
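
For clarity on the _memcpy8 semantics exercised above: _memcpy8 copies the requested bytes and returns dest + size rather than dest, which is why the inlined 4-byte sequence is expected to end with `add x0, x0, #0x4`. The same contract in plain C++ (hypothetical helper name, illustrative only, not part of the patch):

#include <cstddef>
#include <cstring>

// Copy N bytes and return the end pointer (Dest + N) instead of Dest,
// matching the _memcpy8 test helper above.
extern "C" void *memcpy8_reference(void *Dest, const void *Src,
                                   std::size_t N) {
  std::memcpy(Dest, Src, N);
  return static_cast<char *>(Dest) + N;
}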
>From 25cfb58b165fd1190f9b1b52cce1423d2db5d3c1 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Wed, 27 Aug 2025 06:54:14 -0700
Subject: [PATCH 11/26] nit: comment clean up
---
bolt/lib/Passes/BinaryPasses.cpp | 6 +++---
bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 9 ++++-----
2 files changed, 7 insertions(+), 8 deletions(-)
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 1aade44286052..e8124dd3cb4f4 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1866,14 +1866,14 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
const bool IsTailCall = BC.MIB->isTailCall(Inst);
- // Extract size from preceding instructions (AArch64 only)
- // Pattern: MOV X2, #nb-bytes; BL memcpy src, dest, X2
+ // Extract size from preceding instructions (AArch64 only).
+ // Pattern: MOV X2, #nb-bytes; BL memcpy src, dest, X2.
std::optional<uint64_t> KnownSize = std::nullopt;
if (BC.isAArch64()) {
BitVector WrittenRegs(BC.MRI->getNumRegs());
MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
- // Look backwards for size-setting instruction
+ // Look backwards for size-setting instruction.
for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
MCInst &Inst = *InstIt;
WrittenRegs.reset();
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 366d4183bca51..67febc2324e14 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2622,10 +2622,10 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
InstructionListType Code;
uint64_t Size = *KnownSize;
- // Generate the optimized memcpy sequence
+ // Generate the optimized memcpy sequence.
generateSizeSpecificMemcpy(Code, Size);
- // If _memcpy8, adjust X0 to return dest+size instead of dest
+ // If _memcpy8, adjust X0 to return dest+size instead of dest.
if (ReturnEnd)
Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
.addReg(AArch64::X0)
@@ -2637,7 +2637,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
uint64_t Size) const {
- // Helper to add load/store pair
auto addLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
unsigned Reg, unsigned Offset = 0) {
Code.emplace_back(MCInstBuilder(LoadOpc)
@@ -2650,7 +2649,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
.addImm(Offset));
};
- // Generate optimal instruction sequences based on exact size
+ // Generate optimal instruction sequences based on exact size.
switch (Size) {
case 1:
addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
@@ -2674,7 +2673,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
default:
if (Size <= 64) {
- // For sizes up to 64 bytes, greedily use the largest possible loads
+ // For sizes up to 64 bytes, greedily use the largest possible loads.
uint64_t Remaining = Size;
uint64_t Offset = 0;
>From e308855758965504cca82484f66065d186c64093 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 02:12:27 -0700
Subject: [PATCH 12/26] minor refactor
---
bolt/lib/Passes/BinaryPasses.cpp | 11 +++++-----
.../Target/AArch64/AArch64MCPlusBuilder.cpp | 22 +++++++------------
2 files changed, 13 insertions(+), 20 deletions(-)
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index e8124dd3cb4f4..022d06ae80e7b 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1872,6 +1872,7 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
if (BC.isAArch64()) {
BitVector WrittenRegs(BC.MRI->getNumRegs());
MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
+ std::optional<uint64_t> ExtractedSize;
// Look backwards for size-setting instruction.
for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
@@ -1879,12 +1880,10 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
WrittenRegs.reset();
BC.MIB->getWrittenRegs(Inst, WrittenRegs);
- if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) {
- if (std::optional<uint64_t> ExtractedSize =
- BC.MIB->extractMoveImmediate(Inst, SizeReg)) {
- KnownSize = *ExtractedSize;
- break;
- }
+ if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg] &&
+ (ExtractedSize = BC.MIB->extractMoveImmediate(Inst, SizeReg))) {
+ KnownSize = *ExtractedSize;
+ break;
}
}
}
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 67febc2324e14..dfb5fe3cfe30d 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2599,20 +2599,17 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
}
InstructionListType createInlineMemcpy(bool ReturnEnd) const override {
- // Fallback
return createInlineMemcpy(ReturnEnd, std::nullopt);
}
std::optional<uint64_t>
extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
- if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3) {
- if (Inst.getOperand(0).isReg() &&
- Inst.getOperand(0).getReg() == TargetReg &&
- Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() &&
- Inst.getOperand(2).getImm() == 0) {
- return Inst.getOperand(1).getImm();
- }
- }
+ if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3 &&
+ Inst.getOperand(0).isReg() &&
+ Inst.getOperand(0).getReg() == TargetReg &&
+ Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() &&
+ Inst.getOperand(2).getImm() == 0)
+ return Inst.getOperand(1).getImm();
return std::nullopt;
}
@@ -2622,7 +2619,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
InstructionListType Code;
uint64_t Size = *KnownSize;
- // Generate the optimized memcpy sequence.
generateSizeSpecificMemcpy(Code, Size);
// If _memcpy8, adjust X0 to return dest+size instead of dest.
@@ -2701,13 +2697,11 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
Remaining -= 2;
Offset += 2;
}
- if (Remaining == 1) {
+ if (Remaining == 1)
addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3,
Offset);
- }
- } else {
+ } else
Code.clear();
- }
break;
}
return Code;
>From 365a0bfaa0d68e9a5c45f9b5163af49ca6d5c1b8 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 06:33:39 -0700
Subject: [PATCH 13/26] NFC: Post-review refactor
---
bolt/include/bolt/Core/MCPlusBuilder.h | 10 +++
bolt/lib/Passes/BinaryPasses.cpp | 21 +----
.../Target/AArch64/AArch64MCPlusBuilder.cpp | 81 ++++++++++---------
3 files changed, 55 insertions(+), 57 deletions(-)
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index 6cbf288f3b8f4..3192472f5fbe0 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -14,6 +14,7 @@
#ifndef BOLT_CORE_MCPLUSBUILDER_H
#define BOLT_CORE_MCPLUSBUILDER_H
+#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/MCPlus.h"
#include "bolt/Core/Relocation.h"
#include "llvm/ADT/ArrayRef.h"
@@ -1888,6 +1889,15 @@ class MCPlusBuilder {
return {};
}
+ /// Find memcpy size in bytes by using preceding instructions.
+ /// Returns std::nullopt if size cannot be determined (no-op for most
+ /// targets).
+ virtual std::optional<uint64_t>
+ findMemcpySizeInBytes(const BinaryBasicBlock &BB,
+ BinaryBasicBlock::iterator CallInst) const {
+ return std::nullopt;
+ }
+
/// Creates inline memcpy instruction. If \p ReturnEnd is true, then return
/// (dest + n) instead of dest.
virtual InstructionListType createInlineMemcpy(bool ReturnEnd) const {
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 022d06ae80e7b..f1807f6eb997e 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1868,25 +1868,8 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
// Extract size from preceding instructions (AArch64 only).
// Pattern: MOV X2, #nb-bytes; BL memcpy src, dest, X2.
- std::optional<uint64_t> KnownSize = std::nullopt;
- if (BC.isAArch64()) {
- BitVector WrittenRegs(BC.MRI->getNumRegs());
- MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
- std::optional<uint64_t> ExtractedSize;
-
- // Look backwards for size-setting instruction.
- for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
- MCInst &Inst = *InstIt;
- WrittenRegs.reset();
- BC.MIB->getWrittenRegs(Inst, WrittenRegs);
-
- if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg] &&
- (ExtractedSize = BC.MIB->extractMoveImmediate(Inst, SizeReg))) {
- KnownSize = *ExtractedSize;
- break;
- }
- }
- }
+ std::optional<uint64_t> KnownSize =
+ BC.MIB->findMemcpySizeInBytes(BB, II);
if (BC.isAArch64() && !KnownSize.has_value()) {
++II;
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index dfb5fe3cfe30d..6f539b8588f2e 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2604,15 +2604,33 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
std::optional<uint64_t>
extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
- if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3 &&
- Inst.getOperand(0).isReg() &&
+ // Match MOVZXi with the target register and no shift.
+ if (Inst.getOpcode() == AArch64::MOVZXi &&
Inst.getOperand(0).getReg() == TargetReg &&
- Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() &&
Inst.getOperand(2).getImm() == 0)
return Inst.getOperand(1).getImm();
return std::nullopt;
}
+ std::optional<uint64_t>
+ findMemcpySizeInBytes(const BinaryBasicBlock &BB,
+ BinaryBasicBlock::iterator CallInst) const override {
+ BitVector WrittenRegs(RegInfo->getNumRegs());
+ MCPhysReg SizeReg = getIntArgRegister(2);
+ std::optional<uint64_t> ExtractedSize;
+
+ for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) {
+ const MCInst &Inst = *InstIt;
+ WrittenRegs.reset();
+ getWrittenRegs(Inst, WrittenRegs);
+
+ if (SizeReg != getNoRegister() && WrittenRegs[SizeReg] &&
+ (ExtractedSize = extractMoveImmediate(Inst, SizeReg)))
+ return *ExtractedSize;
+ }
+ return std::nullopt;
+ }
+
InstructionListType
createInlineMemcpy(bool ReturnEnd,
std::optional<uint64_t> KnownSize) const override {
@@ -2633,7 +2651,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
uint64_t Size) const {
- auto addLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
+ auto AddLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
unsigned Reg, unsigned Offset = 0) {
Code.emplace_back(MCInstBuilder(LoadOpc)
.addReg(Reg)
@@ -2648,23 +2666,23 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
// Generate optimal instruction sequences based on exact size.
switch (Size) {
case 1:
- addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
+ AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
break;
case 2:
- addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3);
+ AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3);
break;
case 4:
- addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3);
+ AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3);
break;
case 8:
- addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3);
+ AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3);
break;
case 16:
- addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0);
+ AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0);
break;
case 32:
- addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0);
- addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1);
+ AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0);
+ AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1);
break;
default:
@@ -2673,33 +2691,20 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
uint64_t Remaining = Size;
uint64_t Offset = 0;
- while (Remaining >= 16) {
- addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0,
- Offset / 16);
- Remaining -= 16;
- Offset += 16;
- }
- if (Remaining >= 8) {
- addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3,
- Offset / 8);
- Remaining -= 8;
- Offset += 8;
- }
- if (Remaining >= 4) {
- addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3,
- Offset / 4);
- Remaining -= 4;
- Offset += 4;
- }
- if (Remaining >= 2) {
- addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3,
- Offset / 2);
- Remaining -= 2;
- Offset += 2;
- }
- if (Remaining == 1)
- addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3,
- Offset);
+ const std::array<std::tuple<uint64_t, unsigned, unsigned, unsigned>, 5>
+ LoadStoreOps = {
+ {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q0},
+ {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X3},
+ {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W3},
+ {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3},
+ {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3}}};
+
+ for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
+ while (Remaining >= OpSize) {
+ AddLoadStorePair(LoadOp, StoreOp, TempReg, Offset / OpSize);
+ Remaining -= OpSize;
+ Offset += OpSize;
+ }
} else
Code.clear();
break;
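
For reference, the table-driven loop this hunk introduces is equivalent to the
standalone sketch below (illustrative only; decomposeCopy is a made-up name and
not part of the patch). It shows how an arbitrary size up to 64 bytes is split
greedily into chunk sizes, e.g. 37 -> 16 + 16 + 4 + 1, and how a size of 0
yields no chunks at all.

  // Illustrative sketch of the greedy chunking driven by LoadStoreOps above;
  // it returns the chunk sizes instead of emitting load/store instructions.
  #include <cstdint>
  #include <vector>

  std::vector<uint64_t> decomposeCopy(uint64_t Size) {
    static constexpr uint64_t ChunkSizes[] = {16, 8, 4, 2, 1};
    std::vector<uint64_t> Chunks;
    for (uint64_t Chunk : ChunkSizes)
      while (Size >= Chunk) { // take the largest chunk that still fits
        Chunks.push_back(Chunk);
        Size -= Chunk;
      }
    return Chunks; // 37 -> {16, 16, 4, 1}; 0 -> {}
  }
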
>From 84c904ac68b263b48227b3308ad16c795382b7c3 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 06:42:47 -0700
Subject: [PATCH 14/26] NFC: Test for corner case with size 0
---
bolt/test/runtime/AArch64/inline-memcpy.s | 25 ++++++++++++++++++++---
1 file changed, 22 insertions(+), 3 deletions(-)
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 961e21f82851d..3acb5e394d52d 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,8 +7,8 @@
# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
-# Verify BOLT reports that it inlined memcpy calls (9 successful inlines out of 12 total calls)
-# CHECK-INLINE: BOLT-INFO: inlined 9 memcpy() calls
+# Verify BOLT reports that it inlined memcpy calls (10 successful inlines out of 13 total calls)
+# CHECK-INLINE: BOLT-INFO: inlined 10 memcpy() calls
# Each function should use optimal size-specific instructions and NO memcpy calls
@@ -62,6 +62,12 @@
# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
+# 0-byte copy should be inlined with no load/store instructions (nothing to copy)
+# CHECK-ASM-LABEL: <test_0_byte>:
+# CHECK-ASM-NOT: ldr
+# CHECK-ASM-NOT: str
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
# 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
# CHECK-ASM-LABEL: <test_128_byte_too_large>:
# CHECK-ASM-NOT: bl{{.*}}<memcpy
@@ -178,6 +184,19 @@ test_37_byte_arbitrary:
ret
.size test_37_byte_arbitrary, .-test_37_byte_arbitrary
+ .globl test_0_byte
+ .type test_0_byte,@function
+test_0_byte:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #0
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_0_byte, .-test_0_byte
+
.globl test_128_byte_too_large
.type test_128_byte_too_large,@function
test_128_byte_too_large:
@@ -272,12 +291,12 @@ main:
bl test_16_byte_direct
bl test_32_byte_direct
bl test_37_byte_arbitrary
+ bl test_0_byte
bl test_128_byte_too_large
bl test_4_byte_add_immediate
bl test_register_move_negative
bl test_live_in_negative
bl test_memcpy8_4_byte
- bl test_memcpy8_large_size
mov w0, #0
ldp x29, x30, [sp], #16
>From 0561bccf755709811eed3d13e10bdcd2afa5fbe3 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 10:01:21 -0700
Subject: [PATCH 15/26] Use temp registers instead of argument registers
---
.../Target/AArch64/AArch64MCPlusBuilder.cpp | 24 +++++++++----------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 6f539b8588f2e..f17a91bc3ba76 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2666,23 +2666,23 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
// Generate optimal instruction sequences based on exact size.
switch (Size) {
case 1:
- AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
+ AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9);
break;
case 2:
- AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3);
+ AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9);
break;
case 4:
- AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3);
+ AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W9);
break;
case 8:
- AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3);
+ AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X9);
break;
case 16:
- AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0);
+ AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16);
break;
case 32:
- AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0);
- AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1);
+ AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16, 0);
+ AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q17, 1);
break;
default:
@@ -2693,11 +2693,11 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
const std::array<std::tuple<uint64_t, unsigned, unsigned, unsigned>, 5>
LoadStoreOps = {
- {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q0},
- {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X3},
- {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W3},
- {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3},
- {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3}}};
+ {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16},
+ {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9},
+ {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9},
+ {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9},
+ {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}}};
for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
while (Remaining >= OpSize) {
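
The register switch in this patch is a calling-convention choice: X9/W9 and
Q16/Q17 are caller-saved temporaries under AAPCS64 and are not parameter
registers, so the inlined loads cannot disturb values the surrounding code may
still expect in X3/W3 or Q0/Q1. A minimal sketch of the mapping, using a
hypothetical helper name rather than anything in BOLT:

  // Illustrative only: chunk size -> scratch register used by the inlined copy.
  #include <cstdint>
  #include <string>

  std::string scratchRegForChunk(uint64_t ChunkBytes, bool SecondOfPair = false) {
    switch (ChunkBytes) {
    case 16: return SecondOfPair ? "q17" : "q16"; // 128-bit SIMD temporaries
    case 8:  return "x9";                         // 64-bit GPR temporary
    case 4:
    case 2:
    case 1:  return "w9";                         // 32-bit view of X9
    default: return "";                           // no single-chunk form
    }
  }
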
>From cc49db79eea544305571e5e91caa3328c91cf4a7 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 10:01:54 -0700
Subject: [PATCH 16/26] Update early return
---
bolt/lib/Passes/BinaryPasses.cpp | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index f1807f6eb997e..d40f5fb78c7f3 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1871,10 +1871,8 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
std::optional<uint64_t> KnownSize =
BC.MIB->findMemcpySizeInBytes(BB, II);
- if (BC.isAArch64() && !KnownSize.has_value()) {
- ++II;
+ if (BC.isAArch64() && !KnownSize.has_value())
continue;
- }
const InstructionListType NewCode =
BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);
>From 115606be208c8b6675df59b9f231dd709ea863fd Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 10:02:48 -0700
Subject: [PATCH 17/26] Update tests to be more specific about registers and
 add a negative test for the early-return check
---
bolt/test/runtime/AArch64/inline-memcpy.s | 70 +++++++++++++++--------
1 file changed, 45 insertions(+), 25 deletions(-)
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 3acb5e394d52d..14a95d91dd189 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,59 +7,59 @@
# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
-# Verify BOLT reports that it inlined memcpy calls (10 successful inlines out of 13 total calls)
+# Verify BOLT reports that it inlined memcpy calls (10 successful inlines out of 14 total calls)
# CHECK-INLINE: BOLT-INFO: inlined 10 memcpy() calls
# Each function should use optimal size-specific instructions and NO memcpy calls
# 1-byte copy should use single byte load/store (ldrb/strb)
# CHECK-ASM-LABEL: <test_1_byte_direct>:
-# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: ldrb{{.*}}w9, [x1]
+# CHECK-ASM: strb{{.*}}w9, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 2-byte copy should use single 16-bit load/store (ldrh/strh)
# CHECK-ASM-LABEL: <test_2_byte_direct>:
-# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: ldrh{{.*}}w9, [x1]
+# CHECK-ASM: strh{{.*}}w9, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 4-byte copy should use single 32-bit load/store (w register)
# CHECK-ASM-LABEL: <test_4_byte_direct>:
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}w9, [x1]
+# CHECK-ASM: str{{.*}}w9, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 8-byte copy should use single 64-bit load/store (x register)
# CHECK-ASM-LABEL: <test_8_byte_direct>:
-# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}x9, [x1]
+# CHECK-ASM: str{{.*}}x9, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 16-byte copy should use single 128-bit SIMD load/store (q register)
# CHECK-ASM-LABEL: <test_16_byte_direct>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q16, [x1]
+# CHECK-ASM: str{{.*}}q16, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 32-byte copy should use two 128-bit SIMD operations
# CHECK-ASM-LABEL: <test_32_byte_direct>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM: ldr{{.*}}q16, [x1]
+# CHECK-ASM: str{{.*}}q16, [x0]
+# CHECK-ASM: ldr{{.*}}q17, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q17, [x0, #0x10]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
# CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20]
-# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24]
-# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
+# CHECK-ASM: ldr{{.*}}q16, [x1]
+# CHECK-ASM: str{{.*}}q16, [x0]
+# CHECK-ASM: ldr{{.*}}q16, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q16, [x0, #0x10]
+# CHECK-ASM: ldr{{.*}}w9, [x1, #0x20]
+# CHECK-ASM: str{{.*}}w9, [x0, #0x20]
+# CHECK-ASM: ldrb{{.*}}w9, [x1, #0x24]
+# CHECK-ASM: strb{{.*}}w9, [x0, #0x24]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 0-byte copy should be inlined with no load/store instructions (nothing to copy)
@@ -85,10 +85,14 @@
# CHECK-ASM-LABEL: <test_live_in_negative>:
# CHECK-ASM: bl{{.*}}<memcpy
+# Register-based size should NOT be inlined (isAArch64 & size unknown at compile time)
+# CHECK-ASM-LABEL: <test_register_size_negative>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
# _memcpy8 should be inlined with end-pointer return (dest+size)
# CHECK-ASM-LABEL: <test_memcpy8_4_byte>:
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}w9, [x1]
+# CHECK-ASM: str{{.*}}w9, [x0]
# CHECK-ASM: add{{.*}}x0, x0, #0x4
# CHECK-ASM-NOT: bl{{.*}}<_memcpy8
@@ -252,6 +256,21 @@ test_live_in_negative:
ret
.size test_live_in_negative, .-test_live_in_negative
+ .globl test_register_size_negative
+ .type test_register_size_negative,@function
+test_register_size_negative:
+ # This would crash without isAArch64() check: size from register parameter
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x3, #4
+ mov x2, x3
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_register_size_negative, .-test_register_size_negative
+
.globl test_memcpy8_4_byte
.type test_memcpy8_4_byte,@function
test_memcpy8_4_byte:
@@ -296,6 +315,7 @@ main:
bl test_4_byte_add_immediate
bl test_register_move_negative
bl test_live_in_negative
+ bl test_register_size_negative
bl test_memcpy8_4_byte
mov w0, #0
>From 1986bfac3fcfdd3b8036096c72d7f1ed03fea1bc Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 29 Aug 2025 08:03:58 -0700
Subject: [PATCH 18/26] Complex test + register aliasing
---
.../Target/AArch64/AArch64MCPlusBuilder.cpp | 18 +--
bolt/test/runtime/AArch64/inline-memcpy.s | 107 +++++++++++++++++-
2 files changed, 113 insertions(+), 12 deletions(-)
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index f17a91bc3ba76..12e226a00e26d 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2604,10 +2604,11 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
std::optional<uint64_t>
extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
- // Match MOVZXi with the target register and no shift.
- if (Inst.getOpcode() == AArch64::MOVZXi &&
- Inst.getOperand(0).getReg() == TargetReg &&
- Inst.getOperand(2).getImm() == 0)
+ // Match MOVZ instructions (both X and W register variants) with no shift.
+ if ((Inst.getOpcode() == AArch64::MOVZXi ||
+ Inst.getOpcode() == AArch64::MOVZWi) &&
+ Inst.getOperand(2).getImm() == 0 &&
+ getAliases(TargetReg)[Inst.getOperand(0).getReg()])
return Inst.getOperand(1).getImm();
return std::nullopt;
}
@@ -2617,16 +2618,17 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
BinaryBasicBlock::iterator CallInst) const override {
BitVector WrittenRegs(RegInfo->getNumRegs());
MCPhysReg SizeReg = getIntArgRegister(2);
- std::optional<uint64_t> ExtractedSize;
+ const BitVector &SizeRegAliases = getAliases(SizeReg);
for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) {
const MCInst &Inst = *InstIt;
WrittenRegs.reset();
getWrittenRegs(Inst, WrittenRegs);
- if (SizeReg != getNoRegister() && WrittenRegs[SizeReg] &&
- (ExtractedSize = extractMoveImmediate(Inst, SizeReg)))
- return *ExtractedSize;
+ if (SizeReg != getNoRegister() && WrittenRegs.anyCommon(SizeRegAliases)) {
+ if (auto ExtractedSize = extractMoveImmediate(Inst, SizeReg))
+ return *ExtractedSize;
+ }
}
return std::nullopt;
}
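
A note on the alias-aware lookup above: a size of 8 is typically materialised
as "mov w2, #8" (MOVZWi), which defines W2, while getIntArgRegister(2) returns
X2. Checking WrittenRegs[X2] directly would miss that definition; intersecting
with getAliases(X2), which contains W2, catches it. A simplified standalone
sketch of the idea, with hypothetical helpers rather than the real
MCRegisterInfo API:

  // Illustrative only: treat a write to the 32-bit view of a register as a
  // write to the 64-bit size register it aliases.
  #include <string>

  std::string canonical64(const std::string &Reg) {
    if (!Reg.empty() && Reg[0] == 'w')
      return "x" + Reg.substr(1); // w2 and x2 name the same register
    return Reg;
  }

  bool definesSizeReg(const std::string &WrittenReg, const std::string &SizeReg) {
    return canonical64(WrittenReg) == canonical64(SizeReg);
  }
  // definesSizeReg("w2", "x2") == true, so "mov w2, #8" is recognised as
  // setting the memcpy size argument.
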
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 14a95d91dd189..eb6851bbe7e0b 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,8 +7,8 @@
# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
-# Verify BOLT reports that it inlined memcpy calls (10 successful inlines out of 14 total calls)
-# CHECK-INLINE: BOLT-INFO: inlined 10 memcpy() calls
+# Verify BOLT reports that it inlined memcpy calls (12 successful inlines out of 16 total calls)
+# CHECK-INLINE: BOLT-INFO: inlined 12 memcpy() calls
# Each function should use optimal size-specific instructions and NO memcpy calls
@@ -96,6 +96,24 @@
# CHECK-ASM: add{{.*}}x0, x0, #0x4
# CHECK-ASM-NOT: bl{{.*}}<_memcpy8
+# Complex function with caller-saved X9 should inline 8-byte memcpy using X9 as temp register
+# CHECK-ASM-LABEL: <complex_operation>:
+# CHECK-ASM: ldr{{.*}}x9, [x1]
+# CHECK-ASM: str{{.*}}x9, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# Complex function with caller-saved Q16/Q17 should inline 64-byte memcpy using Q16 as temp register
+# CHECK-ASM-LABEL: <complex_fp_operation>:
+# CHECK-ASM: ldr{{.*}}q16, [x1]
+# CHECK-ASM: str{{.*}}q16, [x0]
+# CHECK-ASM: ldr{{.*}}q16, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q16, [x0, #0x10]
+# CHECK-ASM: ldr{{.*}}q16, [x1, #0x20]
+# CHECK-ASM: str{{.*}}q16, [x0, #0x20]
+# CHECK-ASM: ldr{{.*}}q16, [x1, #0x30]
+# CHECK-ASM: str{{.*}}q16, [x0, #0x30]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
.text
.globl test_1_byte_direct
.type test_1_byte_direct,@function
@@ -297,10 +315,80 @@ _memcpy8:
ret
.size _memcpy8, .-_memcpy8
+ .globl complex_operation
+ .type complex_operation,@function
+complex_operation:
+ stp x29, x30, [sp, #-32]!
+ str x19, [sp, #16]
+ mov x29, sp
+ ldp x9, x10, [x0]
+ ldp x11, x12, [x0, #16]
+ mov x19, x1
+ mov x8, x0
+ add x0, x1, #32
+ madd x9, x9, x2, x3
+ and x10, x10, x4
+ asr x12, x12, #2
+ mov w2, #8
+ orr x11, x12, x11, lsl #3
+ eor x12, x9, x10
+ mul x10, x11, x10
+ eor x12, x12, x11
+ add x13, x12, x9
+ add x9, x11, x9, asr #4
+ stp x13, x10, [x1]
+ mov w10, w12
+ stp x9, x10, [x1, #16]
+ add x1, x8, #32
+ bl memcpy
+ ldr x0, [x19, #16]
+ ldr x19, [sp, #16]
+ ldp x29, x30, [sp], #32
+ b use
+ .size complex_operation, .-complex_operation
+
+ .globl use
+ .type use,@function
+use:
+ ret
+ .size use, .-use
+
+# Same as above but using FP caller-saved registers (Q16/17)
+ .globl complex_fp_operation
+ .type complex_fp_operation,@function
+complex_fp_operation:
+ stp x29, x30, [sp, #-48]!
+ stp q8, q9, [sp, #16]
+ mov x29, sp
+ ldr q16, [x0]
+ ldr q17, [x0, #16]
+ mov x8, x0
+ add x0, x1, #32
+ fadd v16.4s, v16.4s, v17.4s
+ fmul v17.4s, v16.4s, v17.4s
+ fsub v16.2d, v16.2d, v17.2d
+ mov w2, #64
+ fmax v17.4s, v16.4s, v17.4s
+ fmin v16.2d, v16.2d, v17.2d
+ str q16, [x1]
+ str q17, [x1, #16]
+ add x1, x8, #32
+ bl memcpy
+ ldp q8, q9, [sp, #16]
+ ldp x29, x30, [sp], #48
+ b use_fp
+ .size complex_fp_operation, .-complex_fp_operation
+
+ .globl use_fp
+ .type use_fp,@function
+use_fp:
+ ret
+ .size use_fp, .-use_fp
+
.globl main
.type main,@function
main:
- stp x29, x30, [sp, #-16]!
+ stp x29, x30, [sp, #-208]!
mov x29, sp
bl test_1_byte_direct
@@ -318,7 +406,18 @@ main:
bl test_register_size_negative
bl test_memcpy8_4_byte
+ add x0, sp, #32
+ add x1, sp, #96
+ mov x2, #10
+ mov x3, #20
+ mov x4, #0xFF
+ bl complex_operation
+
+ add x0, sp, #160
+ add x1, sp, #96
+ bl complex_fp_operation
+
mov w0, #0
- ldp x29, x30, [sp], #16
+ ldp x29, x30, [sp], #208
ret
.size main, .-main
>From bd990ea7582ee01e5872014d05470d9fafdfea2c Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Mon, 1 Sep 2025 01:40:32 -0700
Subject: [PATCH 19/26] NFC: use if with initializer
---
bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 12e226a00e26d..707856b5874ea 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2625,10 +2625,9 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
WrittenRegs.reset();
getWrittenRegs(Inst, WrittenRegs);
- if (SizeReg != getNoRegister() && WrittenRegs.anyCommon(SizeRegAliases)) {
- if (auto ExtractedSize = extractMoveImmediate(Inst, SizeReg))
- return *ExtractedSize;
- }
+ if (SizeReg != getNoRegister() && WrittenRegs.anyCommon(SizeRegAliases);
+ auto ExtractedSize = extractMoveImmediate(Inst, SizeReg))
+ return *ExtractedSize;
}
return std::nullopt;
}
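
For reference, the C++17 form used in this hunk is if (init-statement;
condition): the part before the semicolon is evaluated only for its
declarations and side effects, and only the expression after the semicolon is
the controlling condition. A generic sketch, unrelated to BOLT:

  // Generic if-with-initializer example (not BOLT code).
  #include <map>
  #include <optional>

  std::optional<int> lookup(const std::map<int, int> &M, int Key) {
    if (auto It = M.find(Key); It != M.end()) // init-statement; condition
      return It->second;
    return std::nullopt;
  }
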
>From ee5f859f26eb3272934ff03cef8bcb52ab772e89 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 4 Sep 2025 09:07:33 -0700
Subject: [PATCH 20/26] [style] remove trailing whitespace
---
bolt/test/runtime/AArch64/inline-memcpy.s | 52 +++++++++++------------
1 file changed, 26 insertions(+), 26 deletions(-)
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index eb6851bbe7e0b..0bcb7514afad3 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -3,7 +3,7 @@
# REQUIRES: system-linux, aarch64-registered-target
# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
-# RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q
+# RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q
# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
@@ -65,7 +65,7 @@
# 0-byte copy should be inlined with no load/store instructions (nothing to copy)
# CHECK-ASM-LABEL: <test_0_byte>:
# CHECK-ASM-NOT: ldr
-# CHECK-ASM-NOT: str
+# CHECK-ASM-NOT: str
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
@@ -115,52 +115,52 @@
# CHECK-ASM-NOT: bl{{.*}}<memcpy
.text
- .globl test_1_byte_direct
+ .globl test_1_byte_direct
.type test_1_byte_direct,@function
-test_1_byte_direct:
- stp x29, x30, [sp, #-32]!
+test_1_byte_direct:
+ stp x29, x30, [sp, #-32]!
mov x29, sp
add x1, sp, #16
- add x0, sp, #8
+ add x0, sp, #8
mov x2, #1
bl memcpy
ldp x29, x30, [sp], #32
ret
.size test_1_byte_direct, .-test_1_byte_direct
- .globl test_2_byte_direct
+ .globl test_2_byte_direct
.type test_2_byte_direct,@function
-test_2_byte_direct:
- stp x29, x30, [sp, #-32]!
+test_2_byte_direct:
+ stp x29, x30, [sp, #-32]!
mov x29, sp
add x1, sp, #16
- add x0, sp, #8
+ add x0, sp, #8
mov x2, #2
bl memcpy
ldp x29, x30, [sp], #32
ret
.size test_2_byte_direct, .-test_2_byte_direct
- .globl test_4_byte_direct
+ .globl test_4_byte_direct
.type test_4_byte_direct,@function
-test_4_byte_direct:
- stp x29, x30, [sp, #-32]!
+test_4_byte_direct:
+ stp x29, x30, [sp, #-32]!
mov x29, sp
add x1, sp, #16
- add x0, sp, #8
+ add x0, sp, #8
mov x2, #4
bl memcpy
ldp x29, x30, [sp], #32
ret
.size test_4_byte_direct, .-test_4_byte_direct
- .globl test_8_byte_direct
+ .globl test_8_byte_direct
.type test_8_byte_direct,@function
-test_8_byte_direct:
- stp x29, x30, [sp, #-32]!
+test_8_byte_direct:
+ stp x29, x30, [sp, #-32]!
mov x29, sp
add x1, sp, #16
- add x0, sp, #8
+ add x0, sp, #8
mov x2, #8
bl memcpy
ldp x29, x30, [sp], #32
@@ -185,7 +185,7 @@ test_16_byte_direct:
test_32_byte_direct:
stp x29, x30, [sp, #-80]!
mov x29, sp
- add x1, sp, #16
+ add x1, sp, #16
add x0, sp, #48
mov x2, #32
bl memcpy
@@ -198,7 +198,7 @@ test_32_byte_direct:
test_37_byte_arbitrary:
stp x29, x30, [sp, #-96]!
mov x29, sp
- add x1, sp, #16
+ add x1, sp, #16
add x0, sp, #56
mov x2, #37
bl memcpy
@@ -224,7 +224,7 @@ test_0_byte:
test_128_byte_too_large:
stp x29, x30, [sp, #-288]!
mov x29, sp
- add x1, sp, #16
+ add x1, sp, #16
add x0, sp, #152
mov x2, #128
bl memcpy
@@ -390,12 +390,12 @@ use_fp:
main:
stp x29, x30, [sp, #-208]!
mov x29, sp
-
+
bl test_1_byte_direct
bl test_2_byte_direct
bl test_4_byte_direct
bl test_8_byte_direct
- bl test_16_byte_direct
+ bl test_16_byte_direct
bl test_32_byte_direct
bl test_37_byte_arbitrary
bl test_0_byte
@@ -405,18 +405,18 @@ main:
bl test_live_in_negative
bl test_register_size_negative
bl test_memcpy8_4_byte
-
+
add x0, sp, #32
add x1, sp, #96
mov x2, #10
mov x3, #20
mov x4, #0xFF
bl complex_operation
-
+
add x0, sp, #160
add x1, sp, #96
bl complex_fp_operation
-
+
mov w0, #0
ldp x29, x30, [sp], #208
ret
>From ad503a791330dd6072a89ebbd73eac71829629c4 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 4 Sep 2025 09:18:07 -0700
Subject: [PATCH 21/26] [test] use CHECK-NEXT
---
bolt/test/runtime/AArch64/inline-memcpy.s | 50 +++++++++++------------
1 file changed, 25 insertions(+), 25 deletions(-)
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 0bcb7514afad3..3222935b74fef 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -15,51 +15,51 @@
# 1-byte copy should use single byte load/store (ldrb/strb)
# CHECK-ASM-LABEL: <test_1_byte_direct>:
# CHECK-ASM: ldrb{{.*}}w9, [x1]
-# CHECK-ASM: strb{{.*}}w9, [x0]
+# CHECK-ASM-NEXT: strb{{.*}}w9, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 2-byte copy should use single 16-bit load/store (ldrh/strh)
# CHECK-ASM-LABEL: <test_2_byte_direct>:
# CHECK-ASM: ldrh{{.*}}w9, [x1]
-# CHECK-ASM: strh{{.*}}w9, [x0]
+# CHECK-ASM-NEXT: strh{{.*}}w9, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 4-byte copy should use single 32-bit load/store (w register)
# CHECK-ASM-LABEL: <test_4_byte_direct>:
# CHECK-ASM: ldr{{.*}}w9, [x1]
-# CHECK-ASM: str{{.*}}w9, [x0]
+# CHECK-ASM-NEXT: str{{.*}}w9, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 8-byte copy should use single 64-bit load/store (x register)
# CHECK-ASM-LABEL: <test_8_byte_direct>:
# CHECK-ASM: ldr{{.*}}x9, [x1]
-# CHECK-ASM: str{{.*}}x9, [x0]
+# CHECK-ASM-NEXT: str{{.*}}x9, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 16-byte copy should use single 128-bit SIMD load/store (q register)
# CHECK-ASM-LABEL: <test_16_byte_direct>:
# CHECK-ASM: ldr{{.*}}q16, [x1]
-# CHECK-ASM: str{{.*}}q16, [x0]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 32-byte copy should use two 128-bit SIMD operations
# CHECK-ASM-LABEL: <test_32_byte_direct>:
# CHECK-ASM: ldr{{.*}}q16, [x1]
-# CHECK-ASM: str{{.*}}q16, [x0]
-# CHECK-ASM: ldr{{.*}}q17, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q17, [x0, #0x10]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
+# CHECK-ASM-NEXT: ldr{{.*}}q17, [x1, #0x10]
+# CHECK-ASM-NEXT: str{{.*}}q17, [x0, #0x10]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
# CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
# CHECK-ASM: ldr{{.*}}q16, [x1]
-# CHECK-ASM: str{{.*}}q16, [x0]
-# CHECK-ASM: ldr{{.*}}q16, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q16, [x0, #0x10]
-# CHECK-ASM: ldr{{.*}}w9, [x1, #0x20]
-# CHECK-ASM: str{{.*}}w9, [x0, #0x20]
-# CHECK-ASM: ldrb{{.*}}w9, [x1, #0x24]
-# CHECK-ASM: strb{{.*}}w9, [x0, #0x24]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
+# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x10]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x10]
+# CHECK-ASM-NEXT: ldr{{.*}}w9, [x1, #0x20]
+# CHECK-ASM-NEXT: str{{.*}}w9, [x0, #0x20]
+# CHECK-ASM-NEXT: ldrb{{.*}}w9, [x1, #0x24]
+# CHECK-ASM-NEXT: strb{{.*}}w9, [x0, #0x24]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 0-byte copy should be inlined with no load/store instructions (nothing to copy)
@@ -92,26 +92,26 @@
# _memcpy8 should be inlined with end-pointer return (dest+size)
# CHECK-ASM-LABEL: <test_memcpy8_4_byte>:
# CHECK-ASM: ldr{{.*}}w9, [x1]
-# CHECK-ASM: str{{.*}}w9, [x0]
-# CHECK-ASM: add{{.*}}x0, x0, #0x4
+# CHECK-ASM-NEXT: str{{.*}}w9, [x0]
+# CHECK-ASM-NEXT: add{{.*}}x0, x0, #0x4
# CHECK-ASM-NOT: bl{{.*}}<_memcpy8
# Complex function with caller-saved X9 should inline 8-byte memcpy using X9 as temp register
# CHECK-ASM-LABEL: <complex_operation>:
# CHECK-ASM: ldr{{.*}}x9, [x1]
-# CHECK-ASM: str{{.*}}x9, [x0]
+# CHECK-ASM-NEXT: str{{.*}}x9, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# Complex function with caller-saved Q16/Q17 should inline 64-byte memcpy using Q16 as temp register
# CHECK-ASM-LABEL: <complex_fp_operation>:
# CHECK-ASM: ldr{{.*}}q16, [x1]
-# CHECK-ASM: str{{.*}}q16, [x0]
-# CHECK-ASM: ldr{{.*}}q16, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q16, [x0, #0x10]
-# CHECK-ASM: ldr{{.*}}q16, [x1, #0x20]
-# CHECK-ASM: str{{.*}}q16, [x0, #0x20]
-# CHECK-ASM: ldr{{.*}}q16, [x1, #0x30]
-# CHECK-ASM: str{{.*}}q16, [x0, #0x30]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
+# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x10]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x10]
+# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x20]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x20]
+# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x30]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x30]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
.text
>From 267432aeba503799df057914112ae3450a53fc9b Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 4 Sep 2025 09:58:09 -0700
Subject: [PATCH 22/26] [test] update negative test to use a negative size
---
bolt/test/runtime/AArch64/inline-memcpy.s | 54 ++++-------------------
1 file changed, 8 insertions(+), 46 deletions(-)
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 3222935b74fef..ee934bc50dbd5 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -85,8 +85,8 @@
# CHECK-ASM-LABEL: <test_live_in_negative>:
# CHECK-ASM: bl{{.*}}<memcpy
-# Register-based size should NOT be inlined (isAArch64 & size unknown at compile time)
-# CHECK-ASM-LABEL: <test_register_size_negative>:
+# Negative size should NOT be inlined (invalid size parameter)
+# CHECK-ASM-LABEL: <test_negative_size>:
# CHECK-ASM: bl{{.*}}<memcpy
# _memcpy8 should be inlined with end-pointer return (dest+size)
@@ -274,20 +274,19 @@ test_live_in_negative:
ret
.size test_live_in_negative, .-test_live_in_negative
- .globl test_register_size_negative
- .type test_register_size_negative,@function
-test_register_size_negative:
- # This would crash without isAArch64() check: size from register parameter
+ .globl test_negative_size
+ .type test_negative_size,@function
+test_negative_size:
+ # Negative size should not be inlined
stp x29, x30, [sp, #-32]!
mov x29, sp
add x1, sp, #16
add x0, sp, #8
- mov x3, #4
- mov x2, x3
+ mov x2, #-1
bl memcpy
ldp x29, x30, [sp], #32
ret
- .size test_register_size_negative, .-test_register_size_negative
+ .size test_negative_size, .-test_negative_size
.globl test_memcpy8_4_byte
.type test_memcpy8_4_byte,@function
@@ -384,40 +383,3 @@ complex_fp_operation:
use_fp:
ret
.size use_fp, .-use_fp
-
- .globl main
- .type main,@function
-main:
- stp x29, x30, [sp, #-208]!
- mov x29, sp
-
- bl test_1_byte_direct
- bl test_2_byte_direct
- bl test_4_byte_direct
- bl test_8_byte_direct
- bl test_16_byte_direct
- bl test_32_byte_direct
- bl test_37_byte_arbitrary
- bl test_0_byte
- bl test_128_byte_too_large
- bl test_4_byte_add_immediate
- bl test_register_move_negative
- bl test_live_in_negative
- bl test_register_size_negative
- bl test_memcpy8_4_byte
-
- add x0, sp, #32
- add x1, sp, #96
- mov x2, #10
- mov x3, #20
- mov x4, #0xFF
- bl complex_operation
-
- add x0, sp, #160
- add x1, sp, #96
- bl complex_fp_operation
-
- mov w0, #0
- ldp x29, x30, [sp], #208
- ret
- .size main, .-main
>From 198744d11278c5ec1134252cdccd8bc77ee3380d Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 4 Sep 2025 10:04:51 -0700
Subject: [PATCH 23/26] [nfc] minor refactor
---
bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 707856b5874ea..9e1cec4c14a93 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2598,10 +2598,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
return 4;
}
- InstructionListType createInlineMemcpy(bool ReturnEnd) const override {
- return createInlineMemcpy(ReturnEnd, std::nullopt);
- }
-
std::optional<uint64_t>
extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
// Match MOVZ instructions (both X and W register variants) with no shift.
@@ -2616,8 +2612,11 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
std::optional<uint64_t>
findMemcpySizeInBytes(const BinaryBasicBlock &BB,
BinaryBasicBlock::iterator CallInst) const override {
- BitVector WrittenRegs(RegInfo->getNumRegs());
MCPhysReg SizeReg = getIntArgRegister(2);
+ if (SizeReg == getNoRegister())
+ return std::nullopt;
+
+ BitVector WrittenRegs(RegInfo->getNumRegs());
const BitVector &SizeRegAliases = getAliases(SizeReg);
for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) {
@@ -2625,9 +2624,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
WrittenRegs.reset();
getWrittenRegs(Inst, WrittenRegs);
- if (SizeReg != getNoRegister() && WrittenRegs.anyCommon(SizeRegAliases);
- auto ExtractedSize = extractMoveImmediate(Inst, SizeReg))
- return *ExtractedSize;
+ if (WrittenRegs.anyCommon(SizeRegAliases))
+ return extractMoveImmediate(Inst, SizeReg);
}
return std::nullopt;
}
@@ -2635,6 +2633,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
InstructionListType
createInlineMemcpy(bool ReturnEnd,
std::optional<uint64_t> KnownSize) const override {
+ assert(KnownSize.has_value() &&
+ "AArch64 memcpy inlining requires known size");
InstructionListType Code;
uint64_t Size = *KnownSize;
>From 62b871ec4204cd629e2a59e6f07f291c009c0f0a Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 4 Sep 2025 10:16:34 -0700
Subject: [PATCH 24/26] [bug] fix memcpy call being removed for sizes > 64
---
bolt/lib/Passes/BinaryPasses.cpp | 2 +-
bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 3 +--
bolt/test/runtime/AArch64/inline-memcpy.s | 9 ++++-----
3 files changed, 6 insertions(+), 8 deletions(-)
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index d40f5fb78c7f3..2f1bb21bc1fd8 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1871,7 +1871,7 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
std::optional<uint64_t> KnownSize =
BC.MIB->findMemcpySizeInBytes(BB, II);
- if (BC.isAArch64() && !KnownSize.has_value())
+ if (BC.isAArch64() && (!KnownSize.has_value() || *KnownSize > 64))
continue;
const InstructionListType NewCode =
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 9e1cec4c14a93..bcc9809b52fab 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2706,8 +2706,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
Remaining -= OpSize;
Offset += OpSize;
}
- } else
- Code.clear();
+ }
break;
}
return Code;
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index ee934bc50dbd5..e0072f38db2d2 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,8 +7,8 @@
# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
-# Verify BOLT reports that it inlined memcpy calls (12 successful inlines out of 16 total calls)
-# CHECK-INLINE: BOLT-INFO: inlined 12 memcpy() calls
+# Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 16 total calls)
+# CHECK-INLINE: BOLT-INFO: inlined 11 memcpy() calls
# Each function should use optimal size-specific instructions and NO memcpy calls
@@ -68,10 +68,9 @@
# CHECK-ASM-NOT: str
# CHECK-ASM-NOT: bl{{.*}}<memcpy
-# 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
+# 128-byte copy should NOT be inlined (too large, original call preserved)
# CHECK-ASM-LABEL: <test_128_byte_too_large>:
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-# CHECK-ASM-NOT: ldr{{.*}}q{{[0-9]+}}
+# CHECK-ASM: bl{{.*}}<memcpy
# ADD immediate with non-zero source should NOT be inlined (can't track mov+add chain)
# CHECK-ASM-LABEL: <test_4_byte_add_immediate>:
>From dcab6acd61085456c885d0d8f76d99138829d25e Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 5 Sep 2025 09:16:48 -0700
Subject: [PATCH 25/26] [nfc][test] reorder tests
---
bolt/test/runtime/AArch64/inline-memcpy.s | 36 +++++++++++------------
1 file changed, 18 insertions(+), 18 deletions(-)
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index e0072f38db2d2..dc59a08b889a7 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -68,6 +68,10 @@
# CHECK-ASM-NOT: str
# CHECK-ASM-NOT: bl{{.*}}<memcpy
+# Negative size should NOT be inlined (invalid size parameter)
+# CHECK-ASM-LABEL: <test_negative_size>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
# 128-byte copy should NOT be inlined (too large, original call preserved)
# CHECK-ASM-LABEL: <test_128_byte_too_large>:
# CHECK-ASM: bl{{.*}}<memcpy
@@ -84,10 +88,6 @@
# CHECK-ASM-LABEL: <test_live_in_negative>:
# CHECK-ASM: bl{{.*}}<memcpy
-# Negative size should NOT be inlined (invalid size parameter)
-# CHECK-ASM-LABEL: <test_negative_size>:
-# CHECK-ASM: bl{{.*}}<memcpy
-
# _memcpy8 should be inlined with end-pointer return (dest+size)
# CHECK-ASM-LABEL: <test_memcpy8_4_byte>:
# CHECK-ASM: ldr{{.*}}w9, [x1]
@@ -218,6 +218,20 @@ test_0_byte:
ret
.size test_0_byte, .-test_0_byte
+ .globl test_negative_size
+ .type test_negative_size,@function
+test_negative_size:
+ # Negative size should not be inlined
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #-1
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_negative_size, .-test_negative_size
+
.globl test_128_byte_too_large
.type test_128_byte_too_large, at function
test_128_byte_too_large:
@@ -273,20 +287,6 @@ test_live_in_negative:
ret
.size test_live_in_negative, .-test_live_in_negative
- .globl test_negative_size
- .type test_negative_size,@function
-test_negative_size:
- # Negative size should not be inlined
- stp x29, x30, [sp, #-32]!
- mov x29, sp
- add x1, sp, #16
- add x0, sp, #8
- mov x2, #-1
- bl memcpy
- ldp x29, x30, [sp], #32
- ret
- .size test_negative_size, .-test_negative_size
-
.globl test_memcpy8_4_byte
.type test_memcpy8_4_byte,@function
test_memcpy8_4_byte:
>From 875156e6bf82cb3e9ba27df0bf541374350ff69e Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 5 Sep 2025 09:18:20 -0700
Subject: [PATCH 26/26] [nfc] add assert for default case (future-proofing
 against changes to BinaryPasses.cpp)
---
.../Target/AArch64/AArch64MCPlusBuilder.cpp | 42 ++++++++++---------
1 file changed, 22 insertions(+), 20 deletions(-)
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index bcc9809b52fab..eb402a5681c53 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2687,26 +2687,28 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
break;
default:
- if (Size <= 64) {
- // For sizes up to 64 bytes, greedily use the largest possible loads.
- uint64_t Remaining = Size;
- uint64_t Offset = 0;
-
- const std::array<std::tuple<uint64_t, unsigned, unsigned, unsigned>, 5>
- LoadStoreOps = {
- {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16},
- {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9},
- {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9},
- {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9},
- {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}}};
-
- for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
- while (Remaining >= OpSize) {
- AddLoadStorePair(LoadOp, StoreOp, TempReg, Offset / OpSize);
- Remaining -= OpSize;
- Offset += OpSize;
- }
- }
+ // For sizes up to 64 bytes, greedily use the largest possible loads.
+ // Caller should have already filtered out sizes > 64 bytes.
+ assert(Size <= 64 &&
+ "Size should be <= 64 bytes for AArch64 memcpy inlining");
+
+ uint64_t Remaining = Size;
+ uint64_t Offset = 0;
+
+ const std::array<std::tuple<uint64_t, unsigned, unsigned, unsigned>, 5>
+ LoadStoreOps = {
+ {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16},
+ {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9},
+ {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9},
+ {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9},
+ {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}}};
+
+ for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
+ while (Remaining >= OpSize) {
+ AddLoadStorePair(LoadOp, StoreOp, TempReg, Offset / OpSize);
+ Remaining -= OpSize;
+ Offset += OpSize;
+ }
break;
}
return Code;