[llvm] [BOLT][AArch64] Enabling Inlining for Memcpy for AArch64 in BOLT (PR #154929)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 28 10:12:06 PDT 2025
https://github.com/yafet-a updated https://github.com/llvm/llvm-project/pull/154929
>From ce56f84aa7c86e1b35cf0ca4218a1f23702a206e Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 21 Aug 2025 10:12:03 -0700
Subject: [PATCH 01/17] pre-commit test
---
bolt/test/AArch64/inline-memcpy.s | 193 ++++++++++++++++++++++++++++++
1 file changed, 193 insertions(+)
create mode 100644 bolt/test/AArch64/inline-memcpy.s
diff --git a/bolt/test/AArch64/inline-memcpy.s b/bolt/test/AArch64/inline-memcpy.s
new file mode 100644
index 0000000000000..3bb498e600fb6
--- /dev/null
+++ b/bolt/test/AArch64/inline-memcpy.s
@@ -0,0 +1,193 @@
+## This test checks that BOLT correctly inlines memcpy calls on AArch64.
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang --target=aarch64-unknown-linux-gnu %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
+# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
+
+# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed)
+# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
+
+# Each function should use optimal size-specific instructions and NO memcpy calls
+
+# 1-byte copy should use single byte load/store (ldrb/strb)
+# CHECK-ASM-LABEL: <test_1_byte_direct>:
+# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 2-byte copy should use single 16-bit load/store (ldrh/strh)
+# CHECK-ASM-LABEL: <test_2_byte_direct>:
+# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 4-byte copy should use single 32-bit load/store (w register)
+# CHECK-ASM-LABEL: <test_4_byte_direct>:
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 8-byte copy should use single 64-bit load/store (x register)
+# CHECK-ASM-LABEL: <test_8_byte_direct>:
+# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 16-byte copy should use single 128-bit SIMD load/store (q register)
+# CHECK-ASM-LABEL: <test_16_byte_direct>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 32-byte copy should use two 128-bit SIMD operations
+# CHECK-ASM-LABEL: <test_32_byte_direct>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
+# CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20]
+# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24]
+# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
+# CHECK-ASM-LABEL: <test_128_byte_too_large>:
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+# CHECK-ASM-NOT: ldr{{.*}}q{{[0-9]+}}
+
+ .text
+ .globl test_1_byte_direct
+ .type test_1_byte_direct,@function
+test_1_byte_direct:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #1
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_1_byte_direct, .-test_1_byte_direct
+
+ .globl test_2_byte_direct
+ .type test_2_byte_direct,@function
+test_2_byte_direct:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #2
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_2_byte_direct, .-test_2_byte_direct
+
+ .globl test_4_byte_direct
+ .type test_4_byte_direct,@function
+test_4_byte_direct:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #4
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_4_byte_direct, .-test_4_byte_direct
+
+ .globl test_8_byte_direct
+ .type test_8_byte_direct,@function
+test_8_byte_direct:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #8
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_8_byte_direct, .-test_8_byte_direct
+
+ .globl test_16_byte_direct
+ .type test_16_byte_direct,@function
+test_16_byte_direct:
+ stp x29, x30, [sp, #-48]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #32
+ mov x2, #16
+ bl memcpy
+ ldp x29, x30, [sp], #48
+ ret
+ .size test_16_byte_direct, .-test_16_byte_direct
+
+ .globl test_32_byte_direct
+ .type test_32_byte_direct,@function
+test_32_byte_direct:
+ stp x29, x30, [sp, #-80]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #48
+ mov x2, #32
+ bl memcpy
+ ldp x29, x30, [sp], #80
+ ret
+ .size test_32_byte_direct, .-test_32_byte_direct
+
+ .globl test_37_byte_arbitrary
+ .type test_37_byte_arbitrary,@function
+test_37_byte_arbitrary:
+ stp x29, x30, [sp, #-96]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #56
+ mov x2, #37
+ bl memcpy
+ ldp x29, x30, [sp], #96
+ ret
+ .size test_37_byte_arbitrary, .-test_37_byte_arbitrary
+
+ .globl test_128_byte_too_large
+ .type test_128_byte_too_large,@function
+test_128_byte_too_large:
+ stp x29, x30, [sp, #-288]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #152
+ mov x2, #128
+ bl memcpy
+ ldp x29, x30, [sp], #288
+ ret
+ .size test_128_byte_too_large, .-test_128_byte_too_large
+
+ .globl main
+ .type main,@function
+main:
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ bl test_1_byte_direct
+ bl test_2_byte_direct
+ bl test_4_byte_direct
+ bl test_8_byte_direct
+ bl test_16_byte_direct
+ bl test_32_byte_direct
+ bl test_37_byte_arbitrary
+ bl test_128_byte_too_large
+
+ mov w0, #0
+ ldp x29, x30, [sp], #16
+ ret
+ .size main, .-main
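A note on the decomposition expectations above: for any size up to 64 bytes the pass greedily takes the widest remaining load/store width (16, 8, 4, 2, 1 bytes). The standalone C++ sketch below only models that chunking and is illustrative rather than code from the patch; the pass itself emits LDRQ/STRQ, LDRX/STRX, LDRW/STRW, LDRH/STRH and LDRB/STRB pairs via MCInstBuilder.

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Greedy chunking model: repeatedly emit the widest copy (16, 8, 4, 2, 1
// bytes) that still fits in the remaining size, tracking byte offsets.
static std::vector<std::pair<uint64_t, uint64_t>> decompose(uint64_t Size) {
  std::vector<std::pair<uint64_t, uint64_t>> Chunks; // {Offset, Width}
  uint64_t Remaining = Size, Offset = 0;
  for (uint64_t Width : {16, 8, 4, 2, 1}) {
    while (Remaining >= Width) {
      Chunks.emplace_back(Offset, Width);
      Remaining -= Width;
      Offset += Width;
    }
  }
  return Chunks;
}

int main() {
  // For 37 this prints 16@0, 16@16, 4@32, 1@36, matching the
  // (2*16) + (1*4) + (1*1) expectation in test_37_byte_arbitrary.
  for (auto [Offset, Width] : decompose(37))
    std::printf("%llu@%llu\n", (unsigned long long)Width,
                (unsigned long long)Offset);
  return 0;
}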
>From 1c27d8967a1938cea4e9bf3110362cb91d7b3bbb Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 21 Aug 2025 10:17:40 -0700
Subject: [PATCH 02/17] [BOLT] documentation
---
bolt/docs/CommandLineArgumentReference.md | 2 +-
bolt/lib/Rewrite/BinaryPassManager.cpp | 4 +++-
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md
index f3881c9a640a9..3fc0594514f6e 100644
--- a/bolt/docs/CommandLineArgumentReference.md
+++ b/bolt/docs/CommandLineArgumentReference.md
@@ -631,7 +631,7 @@
- `--inline-memcpy`
- Inline memcpy using 'rep movsb' instruction (X86-only)
+ Inline memcpy using optimized instruction sequences (X86: 'rep movsb', AArch64: width-optimized register operations)
- `--inline-small-functions`
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index 996d2e972599d..6b554598cf1bc 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -247,7 +247,9 @@ static cl::opt<bool> Stoke("stoke", cl::desc("turn on the stoke analysis"),
static cl::opt<bool> StringOps(
"inline-memcpy",
- cl::desc("inline memcpy using 'rep movsb' instruction (X86-only)"),
+ cl::desc(
+ "inline memcpy using size-specific optimized instructions "
+ "(X86: 'rep movsb', AArch64: width-optimized register operations)"),
cl::cat(BoltOptCategory));
static cl::opt<bool> StripRepRet(
>From db353b759b298aed2e0ebf86f99d6049a5a62e12 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 21 Aug 2025 11:25:05 -0700
Subject: [PATCH 03/17] [BOLT][AArch64] Implement safe size-aware memcpy
inlining
---
bolt/include/bolt/Core/MCPlusBuilder.h | 16 ++
bolt/lib/Passes/BinaryPasses.cpp | 28 ++-
.../Target/AArch64/AArch64MCPlusBuilder.cpp | 204 ++++++++++++++++++
3 files changed, 246 insertions(+), 2 deletions(-)
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index e773250ce8734..6cbf288f3b8f4 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -1895,6 +1895,22 @@ class MCPlusBuilder {
return {};
}
+ /// Creates size-aware inline memcpy instruction. If \p KnownSize is provided,
+ /// generates optimized code for that specific size. Falls back to regular
+ /// createInlineMemcpy if size is unknown or not needed (e.g. with X86).
+ virtual InstructionListType
+ createInlineMemcpy(bool ReturnEnd, std::optional<uint64_t> KnownSize) const {
+ return createInlineMemcpy(ReturnEnd);
+ }
+
+ /// Extract immediate value from move instruction that sets the given
+ /// register. Returns the immediate value if the instruction is a
+ /// move-immediate to TargetReg.
+ virtual std::optional<uint64_t>
+ extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const {
+ return std::nullopt;
+ }
+
/// Create a target-specific relocation out of the \p Fixup.
/// Note that not every fixup could be converted into a relocation.
virtual std::optional<Relocation>
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index d7f02b9470030..0068c1ad0bf1c 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1843,7 +1843,7 @@ Error StripRepRet::runOnFunctions(BinaryContext &BC) {
}
Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
- if (!BC.isX86())
+ if (!BC.isX86() && !BC.isAArch64())
return Error::success();
uint64_t NumInlined = 0;
@@ -1866,8 +1866,32 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
const bool IsTailCall = BC.MIB->isTailCall(Inst);
+      // Extract the size of the copy from preceding instructions by looking
+ // for writes to the size register
+ std::optional<uint64_t> KnownSize = std::nullopt;
+ BitVector WrittenRegs(BC.MRI->getNumRegs());
+
+ // Get the size register (3rd arg register, index 2 for AArch64)
+ MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
+
+ // Look backwards through the basic block for size-setting instr
+ for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
+ MCInst &Inst = *InstIt;
+ WrittenRegs.reset(); // Clear and check what the instruction writes to
+ BC.MIB->getWrittenRegs(Inst, WrittenRegs);
+
+ // Check for writes to the size register
+ if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) {
+ if (std::optional<uint64_t> ExtractedSize =
+ BC.MIB->extractMoveImmediate(Inst, SizeReg)) {
+ KnownSize = *ExtractedSize;
+ break;
+ }
+ }
+ }
+
const InstructionListType NewCode =
- BC.MIB->createInlineMemcpy(IsMemcpy8);
+ BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);
II = BB.replaceInstruction(II, NewCode);
std::advance(II, NewCode.size() - 1);
if (IsTailCall) {
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 973261765f951..03f62117ea096 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2597,6 +2597,210 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
getInstructionSize(const MCInst &Inst) const override {
return 4;
}
+
+ InstructionListType createInlineMemcpy(bool ReturnEnd) const override {
+ // Fallback
+ return createInlineMemcpy(ReturnEnd, std::nullopt);
+ }
+
+ std::optional<uint64_t>
+ extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
+ if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3) {
+ if (Inst.getOperand(0).isReg() &&
+ Inst.getOperand(0).getReg() == TargetReg &&
+ Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() &&
+ Inst.getOperand(2).getImm() == 0) {
+ return Inst.getOperand(1).getImm();
+ }
+ }
+ return std::nullopt;
+ }
+
+ InstructionListType
+ createInlineMemcpy(bool ReturnEnd,
+ std::optional<uint64_t> KnownSize) const override {
+ InstructionListType Code;
+ if (ReturnEnd) {
+ if (KnownSize.has_value() && (*KnownSize >> 12) == 0) {
+ // Use immediate if size is known and fits in 12-bit immediate (0-4095)
+ Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
+ .addReg(AArch64::X0)
+ .addReg(AArch64::X0)
+ .addImm(*KnownSize)
+ .addImm(0));
+ } else {
+ // Fall back to register add for unknown or large sizes
+ Code.emplace_back(MCInstBuilder(AArch64::ADDXrr)
+ .addReg(AArch64::X0)
+ .addReg(AArch64::X0)
+ .addReg(AArch64::X2));
+ }
+ }
+
+ if (!KnownSize.has_value()) {
+ return Code;
+ }
+
+ uint64_t Size = *KnownSize;
+ return generateSizeSpecificMemcpy(Code, Size);
+ }
+
+ InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
+ uint64_t Size) const {
+ // Generate optimal instruction sequences based on exact size
+ switch (Size) {
+ case 1:
+ // Single byte copy
+ Code.emplace_back(MCInstBuilder(AArch64::LDRBBui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X1)
+ .addImm(0));
+ Code.emplace_back(MCInstBuilder(AArch64::STRBBui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X0)
+ .addImm(0));
+ break;
+
+ case 2:
+ // 2-byte copy using 16-bit load/store
+ Code.emplace_back(MCInstBuilder(AArch64::LDRHHui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X1)
+ .addImm(0));
+ Code.emplace_back(MCInstBuilder(AArch64::STRHHui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X0)
+ .addImm(0));
+ break;
+
+ case 4:
+ // 4-byte copy using 32-bit load/store
+ Code.emplace_back(MCInstBuilder(AArch64::LDRWui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X1)
+ .addImm(0));
+ Code.emplace_back(MCInstBuilder(AArch64::STRWui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X0)
+ .addImm(0));
+ break;
+
+ case 8:
+ // 8-byte copy using 64-bit load/store
+ Code.emplace_back(MCInstBuilder(AArch64::LDRXui)
+ .addReg(AArch64::X3)
+ .addReg(AArch64::X1)
+ .addImm(0));
+ Code.emplace_back(MCInstBuilder(AArch64::STRXui)
+ .addReg(AArch64::X3)
+ .addReg(AArch64::X0)
+ .addImm(0));
+ break;
+
+ case 16:
+ // 16-byte copy using 128-bit SIMD
+ Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
+ .addReg(AArch64::Q0)
+ .addReg(AArch64::X1)
+ .addImm(0));
+ Code.emplace_back(MCInstBuilder(AArch64::STRQui)
+ .addReg(AArch64::Q0)
+ .addReg(AArch64::X0)
+ .addImm(0));
+ break;
+
+ case 32:
+ // 32-byte copy using two 128-bit SIMD operations
+ Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
+ .addReg(AArch64::Q0)
+ .addReg(AArch64::X1)
+ .addImm(0));
+ Code.emplace_back(MCInstBuilder(AArch64::STRQui)
+ .addReg(AArch64::Q0)
+ .addReg(AArch64::X0)
+ .addImm(0));
+ Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
+ .addReg(AArch64::Q1)
+ .addReg(AArch64::X1)
+ .addImm(1));
+ Code.emplace_back(MCInstBuilder(AArch64::STRQui)
+ .addReg(AArch64::Q1)
+ .addReg(AArch64::X0)
+ .addImm(1));
+ break;
+
+ default:
+ if (Size <= 64) {
+ // For sizes up to 64 bytes, greedily use the largest possible loads in
+ // descending order
+ uint64_t Remaining = Size;
+ uint64_t Offset = 0;
+
+ while (Remaining >= 16) {
+ Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
+ .addReg(AArch64::Q0)
+ .addReg(AArch64::X1)
+ .addImm(Offset / 16));
+ Code.emplace_back(MCInstBuilder(AArch64::STRQui)
+ .addReg(AArch64::Q0)
+ .addReg(AArch64::X0)
+ .addImm(Offset / 16));
+ Remaining -= 16;
+ Offset += 16;
+ }
+ if (Remaining >= 8) {
+ Code.emplace_back(MCInstBuilder(AArch64::LDRXui)
+ .addReg(AArch64::X3)
+ .addReg(AArch64::X1)
+ .addImm(Offset / 8));
+ Code.emplace_back(MCInstBuilder(AArch64::STRXui)
+ .addReg(AArch64::X3)
+ .addReg(AArch64::X0)
+ .addImm(Offset / 8));
+ Remaining -= 8;
+ Offset += 8;
+ }
+ if (Remaining >= 4) {
+ Code.emplace_back(MCInstBuilder(AArch64::LDRWui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X1)
+ .addImm(Offset / 4));
+ Code.emplace_back(MCInstBuilder(AArch64::STRWui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X0)
+ .addImm(Offset / 4));
+ Remaining -= 4;
+ Offset += 4;
+ }
+ if (Remaining >= 2) {
+ Code.emplace_back(MCInstBuilder(AArch64::LDRHHui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X1)
+ .addImm(Offset / 2));
+ Code.emplace_back(MCInstBuilder(AArch64::STRHHui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X0)
+ .addImm(Offset / 2));
+ Remaining -= 2;
+ Offset += 2;
+ }
+ if (Remaining == 1) {
+ Code.emplace_back(MCInstBuilder(AArch64::LDRBBui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X1)
+ .addImm(Offset));
+ Code.emplace_back(MCInstBuilder(AArch64::STRBBui)
+ .addReg(AArch64::W3)
+ .addReg(AArch64::X0)
+ .addImm(Offset));
+ }
+ } else {
+ Code.clear();
+ }
+ break;
+ }
+ return Code;
+ }
};
} // end anonymous namespace
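One clarification on the ReturnEnd path above: _memcpy8 is treated as a memcpy that returns the end pointer rather than the destination, which is why the generated sequence adds the size back into X0. A rough C++ sketch of that assumed contract (illustrative only, not code from this patch):

#include <cstddef>
#include <cstdio>
#include <cstring>

// Assumed _memcpy8 contract: copy like memcpy, but return dest + size
// instead of dest.
void *memcpy8_like(void *Dest, const void *Src, size_t Size) {
  std::memcpy(Dest, Src, Size);
  return static_cast<char *>(Dest) + Size;
}

int main() {
  char Src[4] = {'b', 'o', 'l', 't'};
  char Dst[4];
  void *End = memcpy8_like(Dst, Src, sizeof(Dst));
  // End points one past the last copied byte, i.e. Dst + 4.
  std::printf("%td\n", static_cast<char *>(End) - Dst);
  return 0;
}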
>From 2e5b22b501a83796ff10ae30520e07cb44b21332 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 22 Aug 2025 05:14:11 -0700
Subject: [PATCH 04/17] test target fix for CI cross-compilation issue
---
bolt/test/AArch64/inline-memcpy.s | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/bolt/test/AArch64/inline-memcpy.s b/bolt/test/AArch64/inline-memcpy.s
index 3bb498e600fb6..e46308286e07b 100644
--- a/bolt/test/AArch64/inline-memcpy.s
+++ b/bolt/test/AArch64/inline-memcpy.s
@@ -1,6 +1,6 @@
## This test checks that BOLT correctly inlines memcpy calls on AArch64.
-# REQUIRES: system-linux
+# REQUIRES: system-linux, aarch64-registered-target
# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
# RUN: %clang --target=aarch64-unknown-linux-gnu %t.o -o %t.exe -Wl,-q
>From 385fa23691e05fbdb6ffb24cc6a9526ff8d08020 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 22 Aug 2025 05:49:37 -0700
Subject: [PATCH 05/17] moved inline-memcpy to avoid CI cross-compilation PIE
conflicts
---
bolt/test/runtime/AArch64/inline-memcpy.s | 193 ++++++++++++++++++++++
1 file changed, 193 insertions(+)
create mode 100644 bolt/test/runtime/AArch64/inline-memcpy.s
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
new file mode 100644
index 0000000000000..0e16b6a7e963f
--- /dev/null
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -0,0 +1,193 @@
+## This test checks that BOLT correctly inlines memcpy calls on AArch64.
+
+# REQUIRES: system-linux, aarch64-registered-target
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
+# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
+
+# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed)
+# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
+
+# Each function should use optimal size-specific instructions and NO memcpy calls
+
+# 1-byte copy should use single byte load/store (ldrb/strb)
+# CHECK-ASM-LABEL: <test_1_byte_direct>:
+# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 2-byte copy should use single 16-bit load/store (ldrh/strh)
+# CHECK-ASM-LABEL: <test_2_byte_direct>:
+# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 4-byte copy should use single 32-bit load/store (w register)
+# CHECK-ASM-LABEL: <test_4_byte_direct>:
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 8-byte copy should use single 64-bit load/store (x register)
+# CHECK-ASM-LABEL: <test_8_byte_direct>:
+# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 16-byte copy should use single 128-bit SIMD load/store (q register)
+# CHECK-ASM-LABEL: <test_16_byte_direct>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 32-byte copy should use two 128-bit SIMD operations
+# CHECK-ASM-LABEL: <test_32_byte_direct>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
+# CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20]
+# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24]
+# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
+# CHECK-ASM-LABEL: <test_128_byte_too_large>:
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+# CHECK-ASM-NOT: ldr{{.*}}q{{[0-9]+}}
+
+ .text
+ .globl test_1_byte_direct
+ .type test_1_byte_direct,@function
+test_1_byte_direct:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #1
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_1_byte_direct, .-test_1_byte_direct
+
+ .globl test_2_byte_direct
+ .type test_2_byte_direct,@function
+test_2_byte_direct:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #2
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_2_byte_direct, .-test_2_byte_direct
+
+ .globl test_4_byte_direct
+ .type test_4_byte_direct,@function
+test_4_byte_direct:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #4
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_4_byte_direct, .-test_4_byte_direct
+
+ .globl test_8_byte_direct
+ .type test_8_byte_direct,@function
+test_8_byte_direct:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #8
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_8_byte_direct, .-test_8_byte_direct
+
+ .globl test_16_byte_direct
+ .type test_16_byte_direct,@function
+test_16_byte_direct:
+ stp x29, x30, [sp, #-48]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #32
+ mov x2, #16
+ bl memcpy
+ ldp x29, x30, [sp], #48
+ ret
+ .size test_16_byte_direct, .-test_16_byte_direct
+
+ .globl test_32_byte_direct
+ .type test_32_byte_direct,@function
+test_32_byte_direct:
+ stp x29, x30, [sp, #-80]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #48
+ mov x2, #32
+ bl memcpy
+ ldp x29, x30, [sp], #80
+ ret
+ .size test_32_byte_direct, .-test_32_byte_direct
+
+ .globl test_37_byte_arbitrary
+ .type test_37_byte_arbitrary,@function
+test_37_byte_arbitrary:
+ stp x29, x30, [sp, #-96]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #56
+ mov x2, #37
+ bl memcpy
+ ldp x29, x30, [sp], #96
+ ret
+ .size test_37_byte_arbitrary, .-test_37_byte_arbitrary
+
+ .globl test_128_byte_too_large
+ .type test_128_byte_too_large,@function
+test_128_byte_too_large:
+ stp x29, x30, [sp, #-288]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #152
+ mov x2, #128
+ bl memcpy
+ ldp x29, x30, [sp], #288
+ ret
+ .size test_128_byte_too_large, .-test_128_byte_too_large
+
+ .globl main
+ .type main,@function
+main:
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ bl test_1_byte_direct
+ bl test_2_byte_direct
+ bl test_4_byte_direct
+ bl test_8_byte_direct
+ bl test_16_byte_direct
+ bl test_32_byte_direct
+ bl test_37_byte_arbitrary
+ bl test_128_byte_too_large
+
+ mov w0, #0
+ ldp x29, x30, [sp], #16
+ ret
+ .size main, .-main
>From 4f9ef678f0d07e23a362cf28805749d53bc8b0b5 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 22 Aug 2025 05:56:47 -0700
Subject: [PATCH 06/17] removed old test
---
bolt/test/AArch64/inline-memcpy.s | 193 ------------------------------
1 file changed, 193 deletions(-)
delete mode 100644 bolt/test/AArch64/inline-memcpy.s
diff --git a/bolt/test/AArch64/inline-memcpy.s b/bolt/test/AArch64/inline-memcpy.s
deleted file mode 100644
index e46308286e07b..0000000000000
--- a/bolt/test/AArch64/inline-memcpy.s
+++ /dev/null
@@ -1,193 +0,0 @@
-## This test checks that BOLT correctly inlines memcpy calls on AArch64.
-
-# REQUIRES: system-linux, aarch64-registered-target
-
-# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
-# RUN: %clang --target=aarch64-unknown-linux-gnu %t.o -o %t.exe -Wl,-q
-# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
-# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
-
-# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed)
-# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
-
-# Each function should use optimal size-specific instructions and NO memcpy calls
-
-# 1-byte copy should use single byte load/store (ldrb/strb)
-# CHECK-ASM-LABEL: <test_1_byte_direct>:
-# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 2-byte copy should use single 16-bit load/store (ldrh/strh)
-# CHECK-ASM-LABEL: <test_2_byte_direct>:
-# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 4-byte copy should use single 32-bit load/store (w register)
-# CHECK-ASM-LABEL: <test_4_byte_direct>:
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 8-byte copy should use single 64-bit load/store (x register)
-# CHECK-ASM-LABEL: <test_8_byte_direct>:
-# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 16-byte copy should use single 128-bit SIMD load/store (q register)
-# CHECK-ASM-LABEL: <test_16_byte_direct>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 32-byte copy should use two 128-bit SIMD operations
-# CHECK-ASM-LABEL: <test_32_byte_direct>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
-# CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20]
-# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24]
-# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
-# CHECK-ASM-LABEL: <test_128_byte_too_large>:
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-# CHECK-ASM-NOT: ldr{{.*}}q{{[0-9]+}}
-
- .text
- .globl test_1_byte_direct
- .type test_1_byte_direct,@function
-test_1_byte_direct:
- stp x29, x30, [sp, #-32]!
- mov x29, sp
- add x1, sp, #16
- add x0, sp, #8
- mov x2, #1
- bl memcpy
- ldp x29, x30, [sp], #32
- ret
- .size test_1_byte_direct, .-test_1_byte_direct
-
- .globl test_2_byte_direct
- .type test_2_byte_direct,@function
-test_2_byte_direct:
- stp x29, x30, [sp, #-32]!
- mov x29, sp
- add x1, sp, #16
- add x0, sp, #8
- mov x2, #2
- bl memcpy
- ldp x29, x30, [sp], #32
- ret
- .size test_2_byte_direct, .-test_2_byte_direct
-
- .globl test_4_byte_direct
- .type test_4_byte_direct,@function
-test_4_byte_direct:
- stp x29, x30, [sp, #-32]!
- mov x29, sp
- add x1, sp, #16
- add x0, sp, #8
- mov x2, #4
- bl memcpy
- ldp x29, x30, [sp], #32
- ret
- .size test_4_byte_direct, .-test_4_byte_direct
-
- .globl test_8_byte_direct
- .type test_8_byte_direct,@function
-test_8_byte_direct:
- stp x29, x30, [sp, #-32]!
- mov x29, sp
- add x1, sp, #16
- add x0, sp, #8
- mov x2, #8
- bl memcpy
- ldp x29, x30, [sp], #32
- ret
- .size test_8_byte_direct, .-test_8_byte_direct
-
- .globl test_16_byte_direct
- .type test_16_byte_direct,@function
-test_16_byte_direct:
- stp x29, x30, [sp, #-48]!
- mov x29, sp
- add x1, sp, #16
- add x0, sp, #32
- mov x2, #16
- bl memcpy
- ldp x29, x30, [sp], #48
- ret
- .size test_16_byte_direct, .-test_16_byte_direct
-
- .globl test_32_byte_direct
- .type test_32_byte_direct,@function
-test_32_byte_direct:
- stp x29, x30, [sp, #-80]!
- mov x29, sp
- add x1, sp, #16
- add x0, sp, #48
- mov x2, #32
- bl memcpy
- ldp x29, x30, [sp], #80
- ret
- .size test_32_byte_direct, .-test_32_byte_direct
-
- .globl test_37_byte_arbitrary
- .type test_37_byte_arbitrary,@function
-test_37_byte_arbitrary:
- stp x29, x30, [sp, #-96]!
- mov x29, sp
- add x1, sp, #16
- add x0, sp, #56
- mov x2, #37
- bl memcpy
- ldp x29, x30, [sp], #96
- ret
- .size test_37_byte_arbitrary, .-test_37_byte_arbitrary
-
- .globl test_128_byte_too_large
- .type test_128_byte_too_large,@function
-test_128_byte_too_large:
- stp x29, x30, [sp, #-288]!
- mov x29, sp
- add x1, sp, #16
- add x0, sp, #152
- mov x2, #128
- bl memcpy
- ldp x29, x30, [sp], #288
- ret
- .size test_128_byte_too_large, .-test_128_byte_too_large
-
- .globl main
- .type main,@function
-main:
- stp x29, x30, [sp, #-16]!
- mov x29, sp
-
- bl test_1_byte_direct
- bl test_2_byte_direct
- bl test_4_byte_direct
- bl test_8_byte_direct
- bl test_16_byte_direct
- bl test_32_byte_direct
- bl test_37_byte_arbitrary
- bl test_128_byte_too_large
-
- mov w0, #0
- ldp x29, x30, [sp], #16
- ret
- .size main, .-main
>From e83126edd3dd418086f8341a92609210ba7cb874 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 22 Aug 2025 08:51:18 -0700
Subject: [PATCH 07/17] response to review
---
bolt/lib/Passes/BinaryPasses.cpp | 37 +++--
.../Target/AArch64/AArch64MCPlusBuilder.cpp | 140 ++++--------------
2 files changed, 49 insertions(+), 128 deletions(-)
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 0068c1ad0bf1c..e532c2aa0422d 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1866,26 +1866,25 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
const bool IsTailCall = BC.MIB->isTailCall(Inst);
-      // Extract the size of the copy from preceding instructions by looking
- // for writes to the size register
+ // Extract size from preceding instructions (AArch64 only)
+      // Pattern: MOV X2, #nb-bytes; BL memcpy dest, src, X2
std::optional<uint64_t> KnownSize = std::nullopt;
- BitVector WrittenRegs(BC.MRI->getNumRegs());
-
- // Get the size register (3rd arg register, index 2 for AArch64)
- MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
-
- // Look backwards through the basic block for size-setting instr
- for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
- MCInst &Inst = *InstIt;
- WrittenRegs.reset(); // Clear and check what the instruction writes to
- BC.MIB->getWrittenRegs(Inst, WrittenRegs);
-
- // Check for writes to the size register
- if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) {
- if (std::optional<uint64_t> ExtractedSize =
- BC.MIB->extractMoveImmediate(Inst, SizeReg)) {
- KnownSize = *ExtractedSize;
- break;
+ if (BC.isAArch64()) {
+ BitVector WrittenRegs(BC.MRI->getNumRegs());
+ MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
+
+ // Look backwards for size-setting instruction
+ for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
+ MCInst &Inst = *InstIt;
+ WrittenRegs.reset();
+ BC.MIB->getWrittenRegs(Inst, WrittenRegs);
+
+ if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) {
+ if (std::optional<uint64_t> ExtractedSize =
+ BC.MIB->extractMoveImmediate(Inst, SizeReg)) {
+ KnownSize = *ExtractedSize;
+ break;
+ }
}
}
}
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 03f62117ea096..e640044ec762d 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2647,152 +2647,74 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
uint64_t Size) const {
+ // Helper to add load/store pair
+ auto addLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
+ unsigned Reg, unsigned Offset = 0) {
+ Code.emplace_back(MCInstBuilder(LoadOpc)
+ .addReg(Reg)
+ .addReg(AArch64::X1)
+ .addImm(Offset));
+ Code.emplace_back(MCInstBuilder(StoreOpc)
+ .addReg(Reg)
+ .addReg(AArch64::X0)
+ .addImm(Offset));
+ };
+
// Generate optimal instruction sequences based on exact size
switch (Size) {
case 1:
- // Single byte copy
- Code.emplace_back(MCInstBuilder(AArch64::LDRBBui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X1)
- .addImm(0));
- Code.emplace_back(MCInstBuilder(AArch64::STRBBui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X0)
- .addImm(0));
+ addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
break;
-
case 2:
- // 2-byte copy using 16-bit load/store
- Code.emplace_back(MCInstBuilder(AArch64::LDRHHui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X1)
- .addImm(0));
- Code.emplace_back(MCInstBuilder(AArch64::STRHHui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X0)
- .addImm(0));
+ addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3);
break;
-
case 4:
- // 4-byte copy using 32-bit load/store
- Code.emplace_back(MCInstBuilder(AArch64::LDRWui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X1)
- .addImm(0));
- Code.emplace_back(MCInstBuilder(AArch64::STRWui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X0)
- .addImm(0));
+ addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3);
break;
-
case 8:
- // 8-byte copy using 64-bit load/store
- Code.emplace_back(MCInstBuilder(AArch64::LDRXui)
- .addReg(AArch64::X3)
- .addReg(AArch64::X1)
- .addImm(0));
- Code.emplace_back(MCInstBuilder(AArch64::STRXui)
- .addReg(AArch64::X3)
- .addReg(AArch64::X0)
- .addImm(0));
+ addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3);
break;
-
case 16:
- // 16-byte copy using 128-bit SIMD
- Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
- .addReg(AArch64::Q0)
- .addReg(AArch64::X1)
- .addImm(0));
- Code.emplace_back(MCInstBuilder(AArch64::STRQui)
- .addReg(AArch64::Q0)
- .addReg(AArch64::X0)
- .addImm(0));
+ addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0);
break;
-
case 32:
- // 32-byte copy using two 128-bit SIMD operations
- Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
- .addReg(AArch64::Q0)
- .addReg(AArch64::X1)
- .addImm(0));
- Code.emplace_back(MCInstBuilder(AArch64::STRQui)
- .addReg(AArch64::Q0)
- .addReg(AArch64::X0)
- .addImm(0));
- Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
- .addReg(AArch64::Q1)
- .addReg(AArch64::X1)
- .addImm(1));
- Code.emplace_back(MCInstBuilder(AArch64::STRQui)
- .addReg(AArch64::Q1)
- .addReg(AArch64::X0)
- .addImm(1));
+ addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0);
+ addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1);
break;
default:
if (Size <= 64) {
- // For sizes up to 64 bytes, greedily use the largest possible loads in
- // descending order
+ // For sizes up to 64 bytes, greedily use the largest possible loads
uint64_t Remaining = Size;
uint64_t Offset = 0;
while (Remaining >= 16) {
- Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
- .addReg(AArch64::Q0)
- .addReg(AArch64::X1)
- .addImm(Offset / 16));
- Code.emplace_back(MCInstBuilder(AArch64::STRQui)
- .addReg(AArch64::Q0)
- .addReg(AArch64::X0)
- .addImm(Offset / 16));
+ addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0,
+ Offset / 16);
Remaining -= 16;
Offset += 16;
}
if (Remaining >= 8) {
- Code.emplace_back(MCInstBuilder(AArch64::LDRXui)
- .addReg(AArch64::X3)
- .addReg(AArch64::X1)
- .addImm(Offset / 8));
- Code.emplace_back(MCInstBuilder(AArch64::STRXui)
- .addReg(AArch64::X3)
- .addReg(AArch64::X0)
- .addImm(Offset / 8));
+ addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3,
+ Offset / 8);
Remaining -= 8;
Offset += 8;
}
if (Remaining >= 4) {
- Code.emplace_back(MCInstBuilder(AArch64::LDRWui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X1)
- .addImm(Offset / 4));
- Code.emplace_back(MCInstBuilder(AArch64::STRWui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X0)
- .addImm(Offset / 4));
+ addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3,
+ Offset / 4);
Remaining -= 4;
Offset += 4;
}
if (Remaining >= 2) {
- Code.emplace_back(MCInstBuilder(AArch64::LDRHHui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X1)
- .addImm(Offset / 2));
- Code.emplace_back(MCInstBuilder(AArch64::STRHHui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X0)
- .addImm(Offset / 2));
+ addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3,
+ Offset / 2);
Remaining -= 2;
Offset += 2;
}
if (Remaining == 1) {
- Code.emplace_back(MCInstBuilder(AArch64::LDRBBui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X1)
- .addImm(Offset));
- Code.emplace_back(MCInstBuilder(AArch64::STRBBui)
- .addReg(AArch64::W3)
- .addReg(AArch64::X0)
- .addImm(Offset));
+ addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3,
+ Offset);
}
} else {
Code.clear();
>From cf8279a8b5081eec657a1f835c54470653186787 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Wed, 27 Aug 2025 03:57:43 -0700
Subject: [PATCH 08/17] Update conditional formatting and move check for size
into BinaryPasses
---
bolt/lib/Passes/BinaryPasses.cpp | 5 +++++
bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 13 ++++---------
2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index e532c2aa0422d..1aade44286052 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1889,6 +1889,11 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
}
}
+ if (BC.isAArch64() && !KnownSize.has_value()) {
+ ++II;
+ continue;
+ }
+
const InstructionListType NewCode =
BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);
II = BB.replaceInstruction(II, NewCode);
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index e640044ec762d..9d30fdface0c5 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2621,24 +2621,19 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
std::optional<uint64_t> KnownSize) const override {
InstructionListType Code;
if (ReturnEnd) {
- if (KnownSize.has_value() && (*KnownSize >> 12) == 0) {
- // Use immediate if size is known and fits in 12-bit immediate (0-4095)
+ // Use immediate if size fits in 12-bit immediate (0-4095)
+ // Otherwise, fall back to register add for large sizes
+ if ((*KnownSize >> 12) == 0)
Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
.addReg(AArch64::X0)
.addReg(AArch64::X0)
.addImm(*KnownSize)
.addImm(0));
- } else {
- // Fall back to register add for unknown or large sizes
+ else
Code.emplace_back(MCInstBuilder(AArch64::ADDXrr)
.addReg(AArch64::X0)
.addReg(AArch64::X0)
.addReg(AArch64::X2));
- }
- }
-
- if (!KnownSize.has_value()) {
- return Code;
}
uint64_t Size = *KnownSize;
>From c317eb0cbd62ac6f164cf44b75d40e082167ce3d Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Wed, 27 Aug 2025 04:55:17 -0700
Subject: [PATCH 09/17] Negative Tests (live-in, register move, non-mov
instruction)
---
bolt/test/runtime/AArch64/inline-memcpy.s | 61 ++++++++++++++++++++++-
1 file changed, 60 insertions(+), 1 deletion(-)
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 0e16b6a7e963f..417b444f6a4bb 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,7 +7,7 @@
# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
-# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed)
+# Verify BOLT reports that it inlined memcpy calls (8 successful inlines out of 11 total calls)
# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
# Each function should use optimal size-specific instructions and NO memcpy calls
@@ -67,6 +67,18 @@
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# CHECK-ASM-NOT: ldr{{.*}}q{{[0-9]+}}
+# ADD immediate with non-zero source should NOT be inlined (can't track mov+add chain)
+# CHECK-ASM-LABEL: <test_4_byte_add_immediate>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
+# Register move should NOT be inlined (size unknown at compile time)
+# CHECK-ASM-LABEL: <test_register_move_negative>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
+# Live-in parameter should NOT be inlined (size unknown at compile time)
+# CHECK-ASM-LABEL: <test_live_in_negative>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
.text
.globl test_1_byte_direct
 .type test_1_byte_direct,@function
@@ -172,6 +184,50 @@ test_128_byte_too_large:
ret
.size test_128_byte_too_large, .-test_128_byte_too_large
+ .globl test_4_byte_add_immediate
+ .type test_4_byte_add_immediate,@function
+test_4_byte_add_immediate:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x3, #0
+ add x2, x3, #4
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_4_byte_add_immediate, .-test_4_byte_add_immediate
+
+ .globl test_register_move_negative
+ .type test_register_move_negative,@function
+test_register_move_negative:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x6, #4
+ mov x2, x6
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_register_move_negative, .-test_register_move_negative
+
+ .globl test_live_in_negative
+ .type test_live_in_negative,@function
+test_live_in_negative:
+ # x2 comes in as parameter, no instruction sets it (should NOT inline)
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ # x2 is live-in, no size-setting instruction
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_live_in_negative, .-test_live_in_negative
+
+
+
.globl main
.type main, at function
main:
@@ -186,6 +242,9 @@ main:
bl test_32_byte_direct
bl test_37_byte_arbitrary
bl test_128_byte_too_large
+ bl test_4_byte_add_immediate
+ bl test_register_move_negative
+ bl test_live_in_negative
mov w0, #0
ldp x29, x30, [sp], #16
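To spell out what the negative tests above pin down: a size is only recognized when a plain MOVZ writes the size register with a zero shift, so a mov+add chain, a register-to-register move, or a live-in X2 leaves the memcpy call untouched. A rough standalone model of that predicate, with MCInst details elided (this illustrates the intent of extractMoveImmediate; it is not the pass code):

#include <cstdint>
#include <cstdio>
#include <optional>

// Minimal stand-in for the operands the matcher inspects: a MOVZ-style
// move-immediate with destination register, 16-bit immediate and hw shift.
struct MoveImm {
  unsigned DstReg;
  uint64_t Imm;
  unsigned Shift; // MOVZ shift amount: 0, 16, 32 or 48
};

// Only "movz x2, #imm" with shift 0 (immediates 0-65535) written directly
// to the size register yields a known size; anything else is rejected.
std::optional<uint64_t> matchSize(const MoveImm &MI, unsigned SizeReg) {
  if (MI.DstReg == SizeReg && MI.Shift == 0)
    return MI.Imm;
  return std::nullopt;
}

int main() {
  const unsigned X2 = 2; // stand-in register number for the size argument
  std::printf("%d\n", matchSize({X2, 37, 0}, X2).has_value()); // 1: known
  std::printf("%d\n", matchSize({X2, 1, 16}, X2).has_value()); // 0: shifted
  return 0;
}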
>From df97d61befcc9ceaf3d82648a1b68b88cc3e0451 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Wed, 27 Aug 2025 06:51:08 -0700
Subject: [PATCH 10/17] memcpy8 redundant handling removed
---
.../Target/AArch64/AArch64MCPlusBuilder.cpp | 29 ++++++---------
bolt/test/runtime/AArch64/inline-memcpy.s | 37 ++++++++++++++++++-
2 files changed, 47 insertions(+), 19 deletions(-)
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 9d30fdface0c5..366d4183bca51 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2620,24 +2620,19 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
createInlineMemcpy(bool ReturnEnd,
std::optional<uint64_t> KnownSize) const override {
InstructionListType Code;
- if (ReturnEnd) {
- // Use immediate if size fits in 12-bit immediate (0-4095)
- // Otherwise, fall back to register add for large sizes
- if ((*KnownSize >> 12) == 0)
- Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
- .addReg(AArch64::X0)
- .addReg(AArch64::X0)
- .addImm(*KnownSize)
- .addImm(0));
- else
- Code.emplace_back(MCInstBuilder(AArch64::ADDXrr)
- .addReg(AArch64::X0)
- .addReg(AArch64::X0)
- .addReg(AArch64::X2));
- }
-
uint64_t Size = *KnownSize;
- return generateSizeSpecificMemcpy(Code, Size);
+
+ // Generate the optimized memcpy sequence
+ generateSizeSpecificMemcpy(Code, Size);
+
+ // If _memcpy8, adjust X0 to return dest+size instead of dest
+ if (ReturnEnd)
+ Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
+ .addReg(AArch64::X0)
+ .addReg(AArch64::X0)
+ .addImm(Size)
+ .addImm(0));
+ return Code;
}
InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 417b444f6a4bb..961e21f82851d 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,8 +7,8 @@
# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
-# Verify BOLT reports that it inlined memcpy calls (8 successful inlines out of 11 total calls)
-# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
+# Verify BOLT reports that it inlined memcpy calls (9 successful inlines out of 12 total calls)
+# CHECK-INLINE: BOLT-INFO: inlined 9 memcpy() calls
# Each function should use optimal size-specific instructions and NO memcpy calls
@@ -79,6 +79,13 @@
# CHECK-ASM-LABEL: <test_live_in_negative>:
# CHECK-ASM: bl{{.*}}<memcpy
+# _memcpy8 should be inlined with end-pointer return (dest+size)
+# CHECK-ASM-LABEL: <test_memcpy8_4_byte>:
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: add{{.*}}x0, x0, #0x4
+# CHECK-ASM-NOT: bl{{.*}}<_memcpy8
+
.text
.globl test_1_byte_direct
 .type test_1_byte_direct,@function
@@ -226,7 +233,31 @@ test_live_in_negative:
ret
.size test_live_in_negative, .-test_live_in_negative
+ .globl test_memcpy8_4_byte
+ .type test_memcpy8_4_byte,@function
+test_memcpy8_4_byte:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #4
+ bl _memcpy8
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_memcpy8_4_byte, .-test_memcpy8_4_byte
+ # Simple _memcpy8 implementation that calls memcpy and returns dest+size
+ .globl _memcpy8
+ .type _memcpy8,@function
+_memcpy8:
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ mov x3, x0
+ bl memcpy
+ add x0, x3, x2
+ ldp x29, x30, [sp], #16
+ ret
+ .size _memcpy8, .-_memcpy8
.globl main
 .type main,@function
@@ -245,6 +276,8 @@ main:
bl test_4_byte_add_immediate
bl test_register_move_negative
bl test_live_in_negative
+ bl test_memcpy8_4_byte
+ bl test_memcpy8_large_size
mov w0, #0
ldp x29, x30, [sp], #16
>From 25cfb58b165fd1190f9b1b52cce1423d2db5d3c1 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Wed, 27 Aug 2025 06:54:14 -0700
Subject: [PATCH 11/17] nit: comment clean up
---
bolt/lib/Passes/BinaryPasses.cpp | 6 +++---
bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 9 ++++-----
2 files changed, 7 insertions(+), 8 deletions(-)
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 1aade44286052..e8124dd3cb4f4 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1866,14 +1866,14 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
const bool IsTailCall = BC.MIB->isTailCall(Inst);
- // Extract size from preceding instructions (AArch64 only)
-      // Pattern: MOV X2, #nb-bytes; BL memcpy dest, src, X2
+ // Extract size from preceding instructions (AArch64 only).
+      // Pattern: MOV X2, #nb-bytes; BL memcpy dest, src, X2.
std::optional<uint64_t> KnownSize = std::nullopt;
if (BC.isAArch64()) {
BitVector WrittenRegs(BC.MRI->getNumRegs());
MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
- // Look backwards for size-setting instruction
+ // Look backwards for size-setting instruction.
for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
MCInst &Inst = *InstIt;
WrittenRegs.reset();
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 366d4183bca51..67febc2324e14 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2622,10 +2622,10 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
InstructionListType Code;
uint64_t Size = *KnownSize;
- // Generate the optimized memcpy sequence
+ // Generate the optimized memcpy sequence.
generateSizeSpecificMemcpy(Code, Size);
- // If _memcpy8, adjust X0 to return dest+size instead of dest
+ // If _memcpy8, adjust X0 to return dest+size instead of dest.
if (ReturnEnd)
Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
.addReg(AArch64::X0)
@@ -2637,7 +2637,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
uint64_t Size) const {
- // Helper to add load/store pair
auto addLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
unsigned Reg, unsigned Offset = 0) {
Code.emplace_back(MCInstBuilder(LoadOpc)
@@ -2650,7 +2649,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
.addImm(Offset));
};
- // Generate optimal instruction sequences based on exact size
+ // Generate optimal instruction sequences based on exact size.
switch (Size) {
case 1:
addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
@@ -2674,7 +2673,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
default:
if (Size <= 64) {
- // For sizes up to 64 bytes, greedily use the largest possible loads
+ // For sizes up to 64 bytes, greedily use the largest possible loads.
uint64_t Remaining = Size;
uint64_t Offset = 0;
>From e308855758965504cca82484f66065d186c64093 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 02:12:27 -0700
Subject: [PATCH 12/17] minor refactor
---
bolt/lib/Passes/BinaryPasses.cpp | 11 +++++-----
.../Target/AArch64/AArch64MCPlusBuilder.cpp | 22 +++++++------------
2 files changed, 13 insertions(+), 20 deletions(-)
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index e8124dd3cb4f4..022d06ae80e7b 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1872,6 +1872,7 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
if (BC.isAArch64()) {
BitVector WrittenRegs(BC.MRI->getNumRegs());
MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
+ std::optional<uint64_t> ExtractedSize;
// Look backwards for size-setting instruction.
for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
@@ -1879,12 +1880,10 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
WrittenRegs.reset();
BC.MIB->getWrittenRegs(Inst, WrittenRegs);
- if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) {
- if (std::optional<uint64_t> ExtractedSize =
- BC.MIB->extractMoveImmediate(Inst, SizeReg)) {
- KnownSize = *ExtractedSize;
- break;
- }
+ if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg] &&
+ (ExtractedSize = BC.MIB->extractMoveImmediate(Inst, SizeReg))) {
+ KnownSize = *ExtractedSize;
+ break;
}
}
}
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 67febc2324e14..dfb5fe3cfe30d 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2599,20 +2599,17 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
}
InstructionListType createInlineMemcpy(bool ReturnEnd) const override {
- // Fallback
return createInlineMemcpy(ReturnEnd, std::nullopt);
}
std::optional<uint64_t>
extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
- if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3) {
- if (Inst.getOperand(0).isReg() &&
- Inst.getOperand(0).getReg() == TargetReg &&
- Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() &&
- Inst.getOperand(2).getImm() == 0) {
- return Inst.getOperand(1).getImm();
- }
- }
+ if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3 &&
+ Inst.getOperand(0).isReg() &&
+ Inst.getOperand(0).getReg() == TargetReg &&
+ Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() &&
+ Inst.getOperand(2).getImm() == 0)
+ return Inst.getOperand(1).getImm();
return std::nullopt;
}
@@ -2622,7 +2619,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
InstructionListType Code;
uint64_t Size = *KnownSize;
- // Generate the optimized memcpy sequence.
generateSizeSpecificMemcpy(Code, Size);
// If _memcpy8, adjust X0 to return dest+size instead of dest.
@@ -2701,13 +2697,11 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
Remaining -= 2;
Offset += 2;
}
- if (Remaining == 1) {
+ if (Remaining == 1)
addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3,
Offset);
- }
- } else {
+ } else
Code.clear();
- }
break;
}
return Code;
>From 365a0bfaa0d68e9a5c45f9b5163af49ca6d5c1b8 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 06:33:39 -0700
Subject: [PATCH 13/17] NFC: Post-review refactor
---
bolt/include/bolt/Core/MCPlusBuilder.h | 10 +++
bolt/lib/Passes/BinaryPasses.cpp | 21 +----
.../Target/AArch64/AArch64MCPlusBuilder.cpp | 81 ++++++++++---------
3 files changed, 55 insertions(+), 57 deletions(-)
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index 6cbf288f3b8f4..3192472f5fbe0 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -14,6 +14,7 @@
#ifndef BOLT_CORE_MCPLUSBUILDER_H
#define BOLT_CORE_MCPLUSBUILDER_H
+#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/MCPlus.h"
#include "bolt/Core/Relocation.h"
#include "llvm/ADT/ArrayRef.h"
@@ -1888,6 +1889,15 @@ class MCPlusBuilder {
return {};
}
+ /// Find memcpy size in bytes by using preceding instructions.
+ /// Returns std::nullopt if size cannot be determined (no-op for most
+ /// targets).
+ virtual std::optional<uint64_t>
+ findMemcpySizeInBytes(const BinaryBasicBlock &BB,
+ BinaryBasicBlock::iterator CallInst) const {
+ return std::nullopt;
+ }
+
/// Creates inline memcpy instruction. If \p ReturnEnd is true, then return
/// (dest + n) instead of dest.
virtual InstructionListType createInlineMemcpy(bool ReturnEnd) const {
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 022d06ae80e7b..f1807f6eb997e 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1868,25 +1868,8 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
// Extract size from preceding instructions (AArch64 only).
// Pattern: MOV X2, #nb-bytes; BL memcpy dest, src, X2.
- std::optional<uint64_t> KnownSize = std::nullopt;
- if (BC.isAArch64()) {
- BitVector WrittenRegs(BC.MRI->getNumRegs());
- MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
- std::optional<uint64_t> ExtractedSize;
-
- // Look backwards for size-setting instruction.
- for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
- MCInst &Inst = *InstIt;
- WrittenRegs.reset();
- BC.MIB->getWrittenRegs(Inst, WrittenRegs);
-
- if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg] &&
- (ExtractedSize = BC.MIB->extractMoveImmediate(Inst, SizeReg))) {
- KnownSize = *ExtractedSize;
- break;
- }
- }
- }
+ std::optional<uint64_t> KnownSize =
+ BC.MIB->findMemcpySizeInBytes(BB, II);
if (BC.isAArch64() && !KnownSize.has_value()) {
++II;
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index dfb5fe3cfe30d..6f539b8588f2e 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2604,15 +2604,33 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
std::optional<uint64_t>
extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
- if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3 &&
- Inst.getOperand(0).isReg() &&
+ // Match MOVZXi with the target register and no shift.
+ if (Inst.getOpcode() == AArch64::MOVZXi &&
Inst.getOperand(0).getReg() == TargetReg &&
- Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() &&
Inst.getOperand(2).getImm() == 0)
return Inst.getOperand(1).getImm();
return std::nullopt;
}
+ std::optional<uint64_t>
+ findMemcpySizeInBytes(const BinaryBasicBlock &BB,
+ BinaryBasicBlock::iterator CallInst) const override {
+ BitVector WrittenRegs(RegInfo->getNumRegs());
+ MCPhysReg SizeReg = getIntArgRegister(2);
+ std::optional<uint64_t> ExtractedSize;
+
+ for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) {
+ const MCInst &Inst = *InstIt;
+ WrittenRegs.reset();
+ getWrittenRegs(Inst, WrittenRegs);
+
+ if (SizeReg != getNoRegister() && WrittenRegs[SizeReg] &&
+ (ExtractedSize = extractMoveImmediate(Inst, SizeReg)))
+ return *ExtractedSize;
+ }
+ return std::nullopt;
+ }
+
InstructionListType
createInlineMemcpy(bool ReturnEnd,
std::optional<uint64_t> KnownSize) const override {
@@ -2633,7 +2651,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
uint64_t Size) const {
- auto addLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
+ auto AddLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
unsigned Reg, unsigned Offset = 0) {
Code.emplace_back(MCInstBuilder(LoadOpc)
.addReg(Reg)
@@ -2648,23 +2666,23 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
// Generate optimal instruction sequences based on exact size.
switch (Size) {
case 1:
- addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
+ AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
break;
case 2:
- addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3);
+ AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3);
break;
case 4:
- addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3);
+ AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3);
break;
case 8:
- addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3);
+ AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3);
break;
case 16:
- addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0);
+ AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0);
break;
case 32:
- addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0);
- addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1);
+ AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0);
+ AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1);
break;
default:
@@ -2673,33 +2691,20 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
uint64_t Remaining = Size;
uint64_t Offset = 0;
- while (Remaining >= 16) {
- addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0,
- Offset / 16);
- Remaining -= 16;
- Offset += 16;
- }
- if (Remaining >= 8) {
- addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3,
- Offset / 8);
- Remaining -= 8;
- Offset += 8;
- }
- if (Remaining >= 4) {
- addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3,
- Offset / 4);
- Remaining -= 4;
- Offset += 4;
- }
- if (Remaining >= 2) {
- addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3,
- Offset / 2);
- Remaining -= 2;
- Offset += 2;
- }
- if (Remaining == 1)
- addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3,
- Offset);
+ const std::array<std::tuple<uint64_t, unsigned, unsigned, unsigned>, 5>
+ LoadStoreOps = {
+ {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q0},
+ {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X3},
+ {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W3},
+ {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3},
+ {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3}}};
+
+ for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
+ while (Remaining >= OpSize) {
+ AddLoadStorePair(LoadOp, StoreOp, TempReg, Offset / OpSize);
+ Remaining -= OpSize;
+ Offset += OpSize;
+ }
} else
Code.clear();
break;
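
As a reading aid for the greedy fallback above: with the registers used at this point in the series, a 37-byte copy whose size was recovered from a preceding MOVZXi would expand to roughly the sequence below. This is an illustrative sketch, not captured BOLT output; the MCInst immediates are the scaled values (Offset / OpSize), which the disassembler prints back as byte offsets.

    // 37 = 16 + 16 + 4 + 1, consumed greedily from the largest chunk down
    ldr   q0, [x1]          // bytes 0-15
    str   q0, [x0]
    ldr   q0, [x1, #16]     // bytes 16-31
    str   q0, [x0, #16]
    ldr   w3, [x1, #32]     // bytes 32-35
    str   w3, [x0, #32]
    ldrb  w3, [x1, #36]     // byte 36
    strb  w3, [x0, #36]

Sizes 1, 2, 4, 8, 16 and 32 take the dedicated switch cases instead and never reach this loop.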
>From 84c904ac68b263b48227b3308ad16c795382b7c3 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 06:42:47 -0700
Subject: [PATCH 14/17] NFC: Test for corner case with size 0
---
bolt/test/runtime/AArch64/inline-memcpy.s | 25 ++++++++++++++++++++---
1 file changed, 22 insertions(+), 3 deletions(-)
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 961e21f82851d..3acb5e394d52d 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,8 +7,8 @@
# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
-# Verify BOLT reports that it inlined memcpy calls (9 successful inlines out of 12 total calls)
-# CHECK-INLINE: BOLT-INFO: inlined 9 memcpy() calls
+# Verify BOLT reports that it inlined memcpy calls (10 successful inlines out of 13 total calls)
+# CHECK-INLINE: BOLT-INFO: inlined 10 memcpy() calls
# Each function should use optimal size-specific instructions and NO memcpy calls
@@ -62,6 +62,12 @@
# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
+# 0-byte copy should be inlined with no load/store instructions (nothing to copy)
+# CHECK-ASM-LABEL: <test_0_byte>:
+# CHECK-ASM-NOT: ldr
+# CHECK-ASM-NOT: str
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
# 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
# CHECK-ASM-LABEL: <test_128_byte_too_large>:
# CHECK-ASM-NOT: bl{{.*}}<memcpy
@@ -178,6 +184,19 @@ test_37_byte_arbitrary:
ret
.size test_37_byte_arbitrary, .-test_37_byte_arbitrary
+ .globl test_0_byte
+ .type test_0_byte,@function
+test_0_byte:
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x2, #0
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_0_byte, .-test_0_byte
+
.globl test_128_byte_too_large
 .type test_128_byte_too_large,@function
test_128_byte_too_large:
@@ -272,12 +291,12 @@ main:
bl test_16_byte_direct
bl test_32_byte_direct
bl test_37_byte_arbitrary
+ bl test_0_byte
bl test_128_byte_too_large
bl test_4_byte_add_immediate
bl test_register_move_negative
bl test_live_in_negative
bl test_memcpy8_4_byte
- bl test_memcpy8_large_size
mov w0, #0
ldp x29, x30, [sp], #16
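
For the new zero-size corner case, no chunk in the load/store table divides into 0 bytes, so the greedy loop emits nothing and the call is simply dropped. A sketch of what test_0_byte should look like after --inline-memcpy (assuming the surrounding instructions are left untouched):

    test_0_byte:
        stp   x29, x30, [sp, #-32]!
        mov   x29, sp
        add   x1, sp, #16
        add   x0, sp, #8
        mov   x2, #0                 // the size-setting move stays
                                     // bl memcpy removed; nothing emitted in its place
        ldp   x29, x30, [sp], #32
        ret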
>From 0561bccf755709811eed3d13e10bdcd2afa5fbe3 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 10:01:21 -0700
Subject: [PATCH 15/17] Use temp instead of argument registers
---
.../Target/AArch64/AArch64MCPlusBuilder.cpp | 24 +++++++++----------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 6f539b8588f2e..f17a91bc3ba76 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2666,23 +2666,23 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
// Generate optimal instruction sequences based on exact size.
switch (Size) {
case 1:
- AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
+ AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9);
break;
case 2:
- AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3);
+ AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9);
break;
case 4:
- AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3);
+ AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W9);
break;
case 8:
- AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3);
+ AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X9);
break;
case 16:
- AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0);
+ AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16);
break;
case 32:
- AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0);
- AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1);
+ AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16, 0);
+ AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q17, 1);
break;
default:
@@ -2693,11 +2693,11 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
const std::array<std::tuple<uint64_t, unsigned, unsigned, unsigned>, 5>
LoadStoreOps = {
- {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q0},
- {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X3},
- {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W3},
- {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3},
- {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3}}};
+ {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16},
+ {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9},
+ {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9},
+ {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9},
+ {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}}};
for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
while (Remaining >= OpSize) {
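
On the emitted code this commit is only a register rename; for the 4-byte case, for example (sketch):

    // before this commit          // after this commit
    ldr   w3, [x1]                 ldr   w9, [x1]
    str   w3, [x0]                 str   w9, [x0]

w9/x9 and q16/q17 are caller-saved scratch registers that take no part in AArch64 argument passing, which is presumably the motivation for preferring them over w3/x3 and q0/q1.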
>From cc49db79eea544305571e5e91caa3328c91cf4a7 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 10:01:54 -0700
Subject: [PATCH 16/17] Update early return
---
bolt/lib/Passes/BinaryPasses.cpp | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index f1807f6eb997e..d40f5fb78c7f3 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1871,10 +1871,8 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
std::optional<uint64_t> KnownSize =
BC.MIB->findMemcpySizeInBytes(BB, II);
- if (BC.isAArch64() && !KnownSize.has_value()) {
- ++II;
+ if (BC.isAArch64() && !KnownSize.has_value())
continue;
- }
const InstructionListType NewCode =
BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);
>From 115606be208c8b6675df59b9f231dd709ea863fd Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 10:02:48 -0700
Subject: [PATCH 17/17] Update tests to be more specific about registers +
negative test on early return check
---
bolt/test/runtime/AArch64/inline-memcpy.s | 70 +++++++++++++++--------
1 file changed, 45 insertions(+), 25 deletions(-)
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 3acb5e394d52d..14a95d91dd189 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,59 +7,59 @@
# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
-# Verify BOLT reports that it inlined memcpy calls (10 successful inlines out of 13 total calls)
+# Verify BOLT reports that it inlined memcpy calls (10 successful inlines out of 14 total calls)
# CHECK-INLINE: BOLT-INFO: inlined 10 memcpy() calls
# Each function should use optimal size-specific instructions and NO memcpy calls
# 1-byte copy should use single byte load/store (ldrb/strb)
# CHECK-ASM-LABEL: <test_1_byte_direct>:
-# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: ldrb{{.*}}w9, [x1]
+# CHECK-ASM: strb{{.*}}w9, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 2-byte copy should use single 16-bit load/store (ldrh/strh)
# CHECK-ASM-LABEL: <test_2_byte_direct>:
-# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: ldrh{{.*}}w9, [x1]
+# CHECK-ASM: strh{{.*}}w9, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 4-byte copy should use single 32-bit load/store (w register)
# CHECK-ASM-LABEL: <test_4_byte_direct>:
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}w9, [x1]
+# CHECK-ASM: str{{.*}}w9, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 8-byte copy should use single 64-bit load/store (x register)
# CHECK-ASM-LABEL: <test_8_byte_direct>:
-# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}x9, [x1]
+# CHECK-ASM: str{{.*}}x9, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 16-byte copy should use single 128-bit SIMD load/store (q register)
# CHECK-ASM-LABEL: <test_16_byte_direct>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q16, [x1]
+# CHECK-ASM: str{{.*}}q16, [x0]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 32-byte copy should use two 128-bit SIMD operations
# CHECK-ASM-LABEL: <test_32_byte_direct>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM: ldr{{.*}}q16, [x1]
+# CHECK-ASM: str{{.*}}q16, [x0]
+# CHECK-ASM: ldr{{.*}}q17, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q17, [x0, #0x10]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
# CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20]
-# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24]
-# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
+# CHECK-ASM: ldr{{.*}}q16, [x1]
+# CHECK-ASM: str{{.*}}q16, [x0]
+# CHECK-ASM: ldr{{.*}}q16, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q16, [x0, #0x10]
+# CHECK-ASM: ldr{{.*}}w9, [x1, #0x20]
+# CHECK-ASM: str{{.*}}w9, [x0, #0x20]
+# CHECK-ASM: ldrb{{.*}}w9, [x1, #0x24]
+# CHECK-ASM: strb{{.*}}w9, [x0, #0x24]
# CHECK-ASM-NOT: bl{{.*}}<memcpy
# 0-byte copy should be inlined with no load/store instructions (nothing to copy)
@@ -85,10 +85,14 @@
# CHECK-ASM-LABEL: <test_live_in_negative>:
# CHECK-ASM: bl{{.*}}<memcpy
+# Register-based size should NOT be inlined (on AArch64 the size must be statically known)
+# CHECK-ASM-LABEL: <test_register_size_negative>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
# _memcpy8 should be inlined with end-pointer return (dest+size)
# CHECK-ASM-LABEL: <test_memcpy8_4_byte>:
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}w9, [x1]
+# CHECK-ASM: str{{.*}}w9, [x0]
# CHECK-ASM: add{{.*}}x0, x0, #0x4
# CHECK-ASM-NOT: bl{{.*}}<_memcpy8
@@ -252,6 +256,21 @@ test_live_in_negative:
ret
.size test_live_in_negative, .-test_live_in_negative
+ .globl test_register_size_negative
+ .type test_register_size_negative,@function
+test_register_size_negative:
+  # This would crash without the isAArch64() check: the size comes from a register, not an immediate
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ add x1, sp, #16
+ add x0, sp, #8
+ mov x3, #4
+ mov x2, x3
+ bl memcpy
+ ldp x29, x30, [sp], #32
+ ret
+ .size test_register_size_negative, .-test_register_size_negative
+
.globl test_memcpy8_4_byte
 .type test_memcpy8_4_byte,@function
test_memcpy8_4_byte:
@@ -296,6 +315,7 @@ main:
bl test_4_byte_add_immediate
bl test_register_move_negative
bl test_live_in_negative
+ bl test_register_size_negative
bl test_memcpy8_4_byte
mov w0, #0
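
To summarise the new negative test: the size extractor only recognises an immediate move (MOVZXi) into the size register, so of the two call shapes below only the first is inlined (sketch; the second is exactly the pattern used by test_register_size_negative):

    // inlined: x2 is set by an immediate move
    mov   x2, #4
    bl    memcpy

    // left as a call: x2 is set by a register-register move,
    // so findMemcpySizeInBytes() returns std::nullopt
    mov   x3, #4
    mov   x2, x3
    bl    memcpy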