[llvm] [BOLT][AArch64] Enabling Inlining for Memcpy for AArch64 in BOLT (PR #154929)

via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 28 10:12:06 PDT 2025


https://github.com/yafet-a updated https://github.com/llvm/llvm-project/pull/154929

From ce56f84aa7c86e1b35cf0ca4218a1f23702a206e Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 21 Aug 2025 10:12:03 -0700
Subject: [PATCH 01/17] pre-commit test

---
 bolt/test/AArch64/inline-memcpy.s | 193 ++++++++++++++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100644 bolt/test/AArch64/inline-memcpy.s

diff --git a/bolt/test/AArch64/inline-memcpy.s b/bolt/test/AArch64/inline-memcpy.s
new file mode 100644
index 0000000000000..3bb498e600fb6
--- /dev/null
+++ b/bolt/test/AArch64/inline-memcpy.s
@@ -0,0 +1,193 @@
+## This test checks that BOLT correctly inlines memcpy calls on AArch64.
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang --target=aarch64-unknown-linux-gnu %t.o -o %t.exe -Wl,-q  
+# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
+# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
+
+# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed)
+# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
+
+# Each function should use optimal size-specific instructions and NO memcpy calls
+
+# 1-byte copy should use single byte load/store (ldrb/strb)
+# CHECK-ASM-LABEL: <test_1_byte_direct>:
+# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 2-byte copy should use single 16-bit load/store (ldrh/strh)
+# CHECK-ASM-LABEL: <test_2_byte_direct>:
+# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 4-byte copy should use single 32-bit load/store (w register)
+# CHECK-ASM-LABEL: <test_4_byte_direct>:
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 8-byte copy should use single 64-bit load/store (x register)
+# CHECK-ASM-LABEL: <test_8_byte_direct>:
+# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 16-byte copy should use single 128-bit SIMD load/store (q register)
+# CHECK-ASM-LABEL: <test_16_byte_direct>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 32-byte copy should use two 128-bit SIMD operations
+# CHECK-ASM-LABEL: <test_32_byte_direct>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
+# CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20]
+# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24]
+# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
+# CHECK-ASM-LABEL: <test_128_byte_too_large>:
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+# CHECK-ASM-NOT: ldr{{.*}}q{{[0-9]+}}
+
+	.text
+	.globl	test_1_byte_direct                
+	.type	test_1_byte_direct,@function
+test_1_byte_direct:                              
+	stp	x29, x30, [sp, #-32]!           
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8  
+	mov	x2, #1
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_1_byte_direct, .-test_1_byte_direct
+
+	.globl	test_2_byte_direct                
+	.type	test_2_byte_direct,@function
+test_2_byte_direct:                              
+	stp	x29, x30, [sp, #-32]!           
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8  
+	mov	x2, #2
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_2_byte_direct, .-test_2_byte_direct
+
+	.globl	test_4_byte_direct                
+	.type	test_4_byte_direct,@function
+test_4_byte_direct:                              
+	stp	x29, x30, [sp, #-32]!           
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8  
+	mov	x2, #4
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_4_byte_direct, .-test_4_byte_direct
+
+	.globl	test_8_byte_direct                
+	.type	test_8_byte_direct,@function
+test_8_byte_direct:                              
+	stp	x29, x30, [sp, #-32]!           
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8  
+	mov	x2, #8
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_8_byte_direct, .-test_8_byte_direct
+
+	.globl	test_16_byte_direct
+	.type	test_16_byte_direct,@function
+test_16_byte_direct:
+	stp	x29, x30, [sp, #-48]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #32
+	mov	x2, #16
+	bl	memcpy
+	ldp	x29, x30, [sp], #48
+	ret
+	.size	test_16_byte_direct, .-test_16_byte_direct
+
+	.globl	test_32_byte_direct
+	.type	test_32_byte_direct,@function
+test_32_byte_direct:
+	stp	x29, x30, [sp, #-80]!
+	mov	x29, sp
+	add	x1, sp, #16  
+	add	x0, sp, #48
+	mov	x2, #32
+	bl	memcpy
+	ldp	x29, x30, [sp], #80
+	ret
+	.size	test_32_byte_direct, .-test_32_byte_direct
+
+	.globl	test_37_byte_arbitrary
+	.type	test_37_byte_arbitrary,@function
+test_37_byte_arbitrary:
+	stp	x29, x30, [sp, #-96]!
+	mov	x29, sp
+	add	x1, sp, #16  
+	add	x0, sp, #56
+	mov	x2, #37
+	bl	memcpy
+	ldp	x29, x30, [sp], #96
+	ret
+	.size	test_37_byte_arbitrary, .-test_37_byte_arbitrary
+
+	.globl	test_128_byte_too_large
+	.type	test_128_byte_too_large,@function
+test_128_byte_too_large:
+	stp	x29, x30, [sp, #-288]!
+	mov	x29, sp
+	add	x1, sp, #16  
+	add	x0, sp, #152
+	mov	x2, #128
+	bl	memcpy
+	ldp	x29, x30, [sp], #288
+	ret
+	.size	test_128_byte_too_large, .-test_128_byte_too_large
+
+	.globl	main
+	.type	main,@function
+main:
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
+	
+	bl	test_1_byte_direct
+	bl	test_2_byte_direct
+	bl	test_4_byte_direct
+	bl	test_8_byte_direct
+	bl	test_16_byte_direct  
+	bl	test_32_byte_direct
+	bl	test_37_byte_arbitrary
+	bl	test_128_byte_too_large
+	
+	mov	w0, #0
+	ldp	x29, x30, [sp], #16
+	ret
+	.size	main, .-main
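
Note on the 37-byte case above: the expected sequence follows a greedy,
largest-chunk-first decomposition over widths 16/8/4/2/1. A minimal sketch of
that arithmetic (illustration only; the real emission lives in patch 03's
generateSizeSpecificMemcpy):

  // 37 = 16 + 16 + 4 + 1, i.e. the (2*16) + (1*4) + (1*1) shape checked above.
  uint64_t Remaining = 37, Offset = 0;
  for (uint64_t Chunk : {16, 8, 4, 2, 1}) {
    while (Remaining >= Chunk) {
      // Emit one load/store pair of width Chunk at byte offset Offset.
      Offset += Chunk;
      Remaining -= Chunk;
    }
  }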

From 1c27d8967a1938cea4e9bf3110362cb91d7b3bbb Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 21 Aug 2025 10:17:40 -0700
Subject: [PATCH 02/17] [BOLT] documentation

---
 bolt/docs/CommandLineArgumentReference.md | 2 +-
 bolt/lib/Rewrite/BinaryPassManager.cpp    | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md
index f3881c9a640a9..3fc0594514f6e 100644
--- a/bolt/docs/CommandLineArgumentReference.md
+++ b/bolt/docs/CommandLineArgumentReference.md
@@ -631,7 +631,7 @@
 
 - `--inline-memcpy`
 
-  Inline memcpy using 'rep movsb' instruction (X86-only)
+  Inline memcpy using optimized instruction sequences (X86: 'rep movsb', AArch64: width-optimized register operations)
 
 - `--inline-small-functions`
 
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index 996d2e972599d..6b554598cf1bc 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -247,7 +247,9 @@ static cl::opt<bool> Stoke("stoke", cl::desc("turn on the stoke analysis"),
 
 static cl::opt<bool> StringOps(
     "inline-memcpy",
-    cl::desc("inline memcpy using 'rep movsb' instruction (X86-only)"),
+    cl::desc(
+        "inline memcpy using size-specific optimized instructions "
+        "(X86: 'rep movsb', AArch64: width-optimized register operations)"),
     cl::cat(BoltOptCategory));
 
 static cl::opt<bool> StripRepRet(
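
Usage is unchanged from X86; for reference, the flag is exercised the same way
the new test's RUN lines do (input/output names here are placeholders):

  llvm-bolt a.out --inline-memcpy -o a.bolt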

From db353b759b298aed2e0ebf86f99d6049a5a62e12 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 21 Aug 2025 11:25:05 -0700
Subject: [PATCH 03/17] [BOLT][AArch64] Implement safe size-aware memcpy
 inlining

---
 bolt/include/bolt/Core/MCPlusBuilder.h        |  16 ++
 bolt/lib/Passes/BinaryPasses.cpp              |  28 ++-
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 204 ++++++++++++++++++
 3 files changed, 246 insertions(+), 2 deletions(-)

diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index e773250ce8734..6cbf288f3b8f4 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -1895,6 +1895,22 @@ class MCPlusBuilder {
     return {};
   }
 
+  /// Creates size-aware inline memcpy instruction. If \p KnownSize is provided,
+  /// generates optimized code for that specific size. Falls back to regular
+  /// createInlineMemcpy if size is unknown or not needed (e.g. with X86).
+  virtual InstructionListType
+  createInlineMemcpy(bool ReturnEnd, std::optional<uint64_t> KnownSize) const {
+    return createInlineMemcpy(ReturnEnd);
+  }
+
+  /// Extract immediate value from move instruction that sets the given
+  /// register. Returns the immediate value if the instruction is a
+  /// move-immediate to TargetReg.
+  virtual std::optional<uint64_t>
+  extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const {
+    return std::nullopt;
+  }
+
   /// Create a target-specific relocation out of the \p Fixup.
   /// Note that not every fixup could be converted into a relocation.
   virtual std::optional<Relocation>
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index d7f02b9470030..0068c1ad0bf1c 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1843,7 +1843,7 @@ Error StripRepRet::runOnFunctions(BinaryContext &BC) {
 }
 
 Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
-  if (!BC.isX86())
+  if (!BC.isX86() && !BC.isAArch64())
     return Error::success();
 
   uint64_t NumInlined = 0;
@@ -1866,8 +1866,32 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
         const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
         const bool IsTailCall = BC.MIB->isTailCall(Inst);
 
+        // Extract the size of the copy from preceding instructions by looking
+        // for writes to the size register
+        std::optional<uint64_t> KnownSize = std::nullopt;
+        BitVector WrittenRegs(BC.MRI->getNumRegs());
+
+        // Get the size register (3rd arg register, index 2 for AArch64)
+        MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
+
+        // Look backwards through the basic block for a size-setting instruction
+        for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
+          MCInst &Inst = *InstIt;
+          WrittenRegs.reset(); // Clear and check what the instruction writes to
+          BC.MIB->getWrittenRegs(Inst, WrittenRegs);
+
+          // Check for writes to the size register
+          if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) {
+            if (std::optional<uint64_t> ExtractedSize =
+                    BC.MIB->extractMoveImmediate(Inst, SizeReg)) {
+              KnownSize = *ExtractedSize;
+              break;
+            }
+          }
+        }
+
         const InstructionListType NewCode =
-            BC.MIB->createInlineMemcpy(IsMemcpy8);
+            BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);
         II = BB.replaceInstruction(II, NewCode);
         std::advance(II, NewCode.size() - 1);
         if (IsTailCall) {
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 973261765f951..03f62117ea096 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2597,6 +2597,210 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
   getInstructionSize(const MCInst &Inst) const override {
     return 4;
   }
+
+  InstructionListType createInlineMemcpy(bool ReturnEnd) const override {
+    // Fallback
+    return createInlineMemcpy(ReturnEnd, std::nullopt);
+  }
+
+  std::optional<uint64_t>
+  extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
+    if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3) {
+      if (Inst.getOperand(0).isReg() &&
+          Inst.getOperand(0).getReg() == TargetReg &&
+          Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() &&
+          Inst.getOperand(2).getImm() == 0) {
+        return Inst.getOperand(1).getImm();
+      }
+    }
+    return std::nullopt;
+  }
+
+  InstructionListType
+  createInlineMemcpy(bool ReturnEnd,
+                     std::optional<uint64_t> KnownSize) const override {
+    InstructionListType Code;
+    if (ReturnEnd) {
+      if (KnownSize.has_value() && (*KnownSize >> 12) == 0) {
+        // Use immediate if size is known and fits in 12-bit immediate (0-4095)
+        Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
+                              .addReg(AArch64::X0)
+                              .addReg(AArch64::X0)
+                              .addImm(*KnownSize)
+                              .addImm(0));
+      } else {
+        // Fall back to register add for unknown or large sizes
+        Code.emplace_back(MCInstBuilder(AArch64::ADDXrr)
+                              .addReg(AArch64::X0)
+                              .addReg(AArch64::X0)
+                              .addReg(AArch64::X2));
+      }
+    }
+
+    if (!KnownSize.has_value()) {
+      return Code;
+    }
+
+    uint64_t Size = *KnownSize;
+    return generateSizeSpecificMemcpy(Code, Size);
+  }
+
+  InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
+                                                 uint64_t Size) const {
+    // Generate optimal instruction sequences based on exact size
+    switch (Size) {
+    case 1:
+      // Single byte copy
+      Code.emplace_back(MCInstBuilder(AArch64::LDRBBui)
+                            .addReg(AArch64::W3)
+                            .addReg(AArch64::X1)
+                            .addImm(0));
+      Code.emplace_back(MCInstBuilder(AArch64::STRBBui)
+                            .addReg(AArch64::W3)
+                            .addReg(AArch64::X0)
+                            .addImm(0));
+      break;
+
+    case 2:
+      // 2-byte copy using 16-bit load/store
+      Code.emplace_back(MCInstBuilder(AArch64::LDRHHui)
+                            .addReg(AArch64::W3)
+                            .addReg(AArch64::X1)
+                            .addImm(0));
+      Code.emplace_back(MCInstBuilder(AArch64::STRHHui)
+                            .addReg(AArch64::W3)
+                            .addReg(AArch64::X0)
+                            .addImm(0));
+      break;
+
+    case 4:
+      // 4-byte copy using 32-bit load/store
+      Code.emplace_back(MCInstBuilder(AArch64::LDRWui)
+                            .addReg(AArch64::W3)
+                            .addReg(AArch64::X1)
+                            .addImm(0));
+      Code.emplace_back(MCInstBuilder(AArch64::STRWui)
+                            .addReg(AArch64::W3)
+                            .addReg(AArch64::X0)
+                            .addImm(0));
+      break;
+
+    case 8:
+      // 8-byte copy using 64-bit load/store
+      Code.emplace_back(MCInstBuilder(AArch64::LDRXui)
+                            .addReg(AArch64::X3)
+                            .addReg(AArch64::X1)
+                            .addImm(0));
+      Code.emplace_back(MCInstBuilder(AArch64::STRXui)
+                            .addReg(AArch64::X3)
+                            .addReg(AArch64::X0)
+                            .addImm(0));
+      break;
+
+    case 16:
+      // 16-byte copy using 128-bit SIMD
+      Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
+                            .addReg(AArch64::Q0)
+                            .addReg(AArch64::X1)
+                            .addImm(0));
+      Code.emplace_back(MCInstBuilder(AArch64::STRQui)
+                            .addReg(AArch64::Q0)
+                            .addReg(AArch64::X0)
+                            .addImm(0));
+      break;
+
+    case 32:
+      // 32-byte copy using two 128-bit SIMD operations
+      Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
+                            .addReg(AArch64::Q0)
+                            .addReg(AArch64::X1)
+                            .addImm(0));
+      Code.emplace_back(MCInstBuilder(AArch64::STRQui)
+                            .addReg(AArch64::Q0)
+                            .addReg(AArch64::X0)
+                            .addImm(0));
+      Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
+                            .addReg(AArch64::Q1)
+                            .addReg(AArch64::X1)
+                            .addImm(1));
+      Code.emplace_back(MCInstBuilder(AArch64::STRQui)
+                            .addReg(AArch64::Q1)
+                            .addReg(AArch64::X0)
+                            .addImm(1));
+      break;
+
+    default:
+      if (Size <= 64) {
+        // For sizes up to 64 bytes, greedily use the largest possible loads in
+        // descending order
+        uint64_t Remaining = Size;
+        uint64_t Offset = 0;
+
+        while (Remaining >= 16) {
+          Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
+                                .addReg(AArch64::Q0)
+                                .addReg(AArch64::X1)
+                                .addImm(Offset / 16));
+          Code.emplace_back(MCInstBuilder(AArch64::STRQui)
+                                .addReg(AArch64::Q0)
+                                .addReg(AArch64::X0)
+                                .addImm(Offset / 16));
+          Remaining -= 16;
+          Offset += 16;
+        }
+        if (Remaining >= 8) {
+          Code.emplace_back(MCInstBuilder(AArch64::LDRXui)
+                                .addReg(AArch64::X3)
+                                .addReg(AArch64::X1)
+                                .addImm(Offset / 8));
+          Code.emplace_back(MCInstBuilder(AArch64::STRXui)
+                                .addReg(AArch64::X3)
+                                .addReg(AArch64::X0)
+                                .addImm(Offset / 8));
+          Remaining -= 8;
+          Offset += 8;
+        }
+        if (Remaining >= 4) {
+          Code.emplace_back(MCInstBuilder(AArch64::LDRWui)
+                                .addReg(AArch64::W3)
+                                .addReg(AArch64::X1)
+                                .addImm(Offset / 4));
+          Code.emplace_back(MCInstBuilder(AArch64::STRWui)
+                                .addReg(AArch64::W3)
+                                .addReg(AArch64::X0)
+                                .addImm(Offset / 4));
+          Remaining -= 4;
+          Offset += 4;
+        }
+        if (Remaining >= 2) {
+          Code.emplace_back(MCInstBuilder(AArch64::LDRHHui)
+                                .addReg(AArch64::W3)
+                                .addReg(AArch64::X1)
+                                .addImm(Offset / 2));
+          Code.emplace_back(MCInstBuilder(AArch64::STRHHui)
+                                .addReg(AArch64::W3)
+                                .addReg(AArch64::X0)
+                                .addImm(Offset / 2));
+          Remaining -= 2;
+          Offset += 2;
+        }
+        if (Remaining == 1) {
+          Code.emplace_back(MCInstBuilder(AArch64::LDRBBui)
+                                .addReg(AArch64::W3)
+                                .addReg(AArch64::X1)
+                                .addImm(Offset));
+          Code.emplace_back(MCInstBuilder(AArch64::STRBBui)
+                                .addReg(AArch64::W3)
+                                .addReg(AArch64::X0)
+                                .addImm(Offset));
+        }
+      } else {
+        Code.clear();
+      }
+      break;
+    }
+    return Code;
+  }
 };
 
 } // end anonymous namespace
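
A usage sketch of the two new hooks (hypothetical, not part of the patch; MIB
stands in for an AArch64MCPlusBuilder instance, and the operand layout follows
the MOVZXi check above: dest register, immediate, shift, with only a zero
shift accepted):

  // Build "mov x2, #37" and recover the size the way the pass does.
  MCInst Mov = MCInstBuilder(AArch64::MOVZXi)
                   .addReg(AArch64::X2) // size register, getIntArgRegister(2)
                   .addImm(37)          // immediate payload
                   .addImm(0);          // LSL #0; a non-zero shift is rejected
  std::optional<uint64_t> Size = MIB.extractMoveImmediate(Mov, AArch64::X2);
  // Size == 37, so MIB.createInlineMemcpy(/*ReturnEnd=*/false, Size) returns
  // the size-specific load/store sequence instead of an empty list.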

From 2e5b22b501a83796ff10ae30520e07cb44b21332 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 22 Aug 2025 05:14:11 -0700
Subject: [PATCH 04/17] test target fix for CI cross-compilation issue

---
 bolt/test/AArch64/inline-memcpy.s | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bolt/test/AArch64/inline-memcpy.s b/bolt/test/AArch64/inline-memcpy.s
index 3bb498e600fb6..e46308286e07b 100644
--- a/bolt/test/AArch64/inline-memcpy.s
+++ b/bolt/test/AArch64/inline-memcpy.s
@@ -1,6 +1,6 @@
 ## This test checks that BOLT correctly inlines memcpy calls on AArch64.
 
-# REQUIRES: system-linux
+# REQUIRES: system-linux, aarch64-registered-target
 
 # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
 # RUN: %clang --target=aarch64-unknown-linux-gnu %t.o -o %t.exe -Wl,-q  

From 385fa23691e05fbdb6ffb24cc6a9526ff8d08020 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 22 Aug 2025 05:49:37 -0700
Subject: [PATCH 05/17] moved inline-memcpy to avoid CI cross-compilation PIE
 conflicts

---
 bolt/test/runtime/AArch64/inline-memcpy.s | 193 ++++++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100644 bolt/test/runtime/AArch64/inline-memcpy.s

diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
new file mode 100644
index 0000000000000..0e16b6a7e963f
--- /dev/null
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -0,0 +1,193 @@
+## This test checks that BOLT correctly inlines memcpy calls on AArch64.
+
+# REQUIRES: system-linux, aarch64-registered-target
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q 
+# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
+# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
+
+# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed)
+# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
+
+# Each function should use optimal size-specific instructions and NO memcpy calls
+
+# 1-byte copy should use single byte load/store (ldrb/strb)
+# CHECK-ASM-LABEL: <test_1_byte_direct>:
+# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 2-byte copy should use single 16-bit load/store (ldrh/strh)
+# CHECK-ASM-LABEL: <test_2_byte_direct>:
+# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 4-byte copy should use single 32-bit load/store (w register)
+# CHECK-ASM-LABEL: <test_4_byte_direct>:
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 8-byte copy should use single 64-bit load/store (x register)
+# CHECK-ASM-LABEL: <test_8_byte_direct>:
+# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 16-byte copy should use single 128-bit SIMD load/store (q register)
+# CHECK-ASM-LABEL: <test_16_byte_direct>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 32-byte copy should use two 128-bit SIMD operations
+# CHECK-ASM-LABEL: <test_32_byte_direct>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
+# CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20]
+# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24]
+# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
+# CHECK-ASM-LABEL: <test_128_byte_too_large>:
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+# CHECK-ASM-NOT: ldr{{.*}}q{{[0-9]+}}
+
+	.text
+	.globl	test_1_byte_direct                
+	.type	test_1_byte_direct,@function
+test_1_byte_direct:                              
+	stp	x29, x30, [sp, #-32]!           
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8  
+	mov	x2, #1
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_1_byte_direct, .-test_1_byte_direct
+
+	.globl	test_2_byte_direct                
+	.type	test_2_byte_direct,@function
+test_2_byte_direct:                              
+	stp	x29, x30, [sp, #-32]!           
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8  
+	mov	x2, #2
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_2_byte_direct, .-test_2_byte_direct
+
+	.globl	test_4_byte_direct                
+	.type	test_4_byte_direct,@function
+test_4_byte_direct:                              
+	stp	x29, x30, [sp, #-32]!           
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8  
+	mov	x2, #4
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_4_byte_direct, .-test_4_byte_direct
+
+	.globl	test_8_byte_direct                
+	.type	test_8_byte_direct,@function
+test_8_byte_direct:                              
+	stp	x29, x30, [sp, #-32]!           
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8  
+	mov	x2, #8
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_8_byte_direct, .-test_8_byte_direct
+
+	.globl	test_16_byte_direct
+	.type	test_16_byte_direct,@function
+test_16_byte_direct:
+	stp	x29, x30, [sp, #-48]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #32
+	mov	x2, #16
+	bl	memcpy
+	ldp	x29, x30, [sp], #48
+	ret
+	.size	test_16_byte_direct, .-test_16_byte_direct
+
+	.globl	test_32_byte_direct
+	.type	test_32_byte_direct,@function
+test_32_byte_direct:
+	stp	x29, x30, [sp, #-80]!
+	mov	x29, sp
+	add	x1, sp, #16  
+	add	x0, sp, #48
+	mov	x2, #32
+	bl	memcpy
+	ldp	x29, x30, [sp], #80
+	ret
+	.size	test_32_byte_direct, .-test_32_byte_direct
+
+	.globl	test_37_byte_arbitrary
+	.type	test_37_byte_arbitrary,@function
+test_37_byte_arbitrary:
+	stp	x29, x30, [sp, #-96]!
+	mov	x29, sp
+	add	x1, sp, #16  
+	add	x0, sp, #56
+	mov	x2, #37
+	bl	memcpy
+	ldp	x29, x30, [sp], #96
+	ret
+	.size	test_37_byte_arbitrary, .-test_37_byte_arbitrary
+
+	.globl	test_128_byte_too_large
+	.type	test_128_byte_too_large,@function
+test_128_byte_too_large:
+	stp	x29, x30, [sp, #-288]!
+	mov	x29, sp
+	add	x1, sp, #16  
+	add	x0, sp, #152
+	mov	x2, #128
+	bl	memcpy
+	ldp	x29, x30, [sp], #288
+	ret
+	.size	test_128_byte_too_large, .-test_128_byte_too_large
+
+	.globl	main
+	.type	main,@function
+main:
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
+	
+	bl	test_1_byte_direct
+	bl	test_2_byte_direct
+	bl	test_4_byte_direct
+	bl	test_8_byte_direct
+	bl	test_16_byte_direct  
+	bl	test_32_byte_direct
+	bl	test_37_byte_arbitrary
+	bl	test_128_byte_too_large
+	
+	mov	w0, #0
+	ldp	x29, x30, [sp], #16
+	ret
+	.size	main, .-main

From 4f9ef678f0d07e23a362cf28805749d53bc8b0b5 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 22 Aug 2025 05:56:47 -0700
Subject: [PATCH 06/17] removed old test

---
 bolt/test/AArch64/inline-memcpy.s | 193 ------------------------------
 1 file changed, 193 deletions(-)
 delete mode 100644 bolt/test/AArch64/inline-memcpy.s

diff --git a/bolt/test/AArch64/inline-memcpy.s b/bolt/test/AArch64/inline-memcpy.s
deleted file mode 100644
index e46308286e07b..0000000000000
--- a/bolt/test/AArch64/inline-memcpy.s
+++ /dev/null
@@ -1,193 +0,0 @@
-## This test checks that BOLT correctly inlines memcpy calls on AArch64.
-
-# REQUIRES: system-linux, aarch64-registered-target
-
-# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
-# RUN: %clang --target=aarch64-unknown-linux-gnu %t.o -o %t.exe -Wl,-q  
-# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
-# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
-
-# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed)
-# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
-
-# Each function should use optimal size-specific instructions and NO memcpy calls
-
-# 1-byte copy should use single byte load/store (ldrb/strb)
-# CHECK-ASM-LABEL: <test_1_byte_direct>:
-# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 2-byte copy should use single 16-bit load/store (ldrh/strh)
-# CHECK-ASM-LABEL: <test_2_byte_direct>:
-# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 4-byte copy should use single 32-bit load/store (w register)
-# CHECK-ASM-LABEL: <test_4_byte_direct>:
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 8-byte copy should use single 64-bit load/store (x register)
-# CHECK-ASM-LABEL: <test_8_byte_direct>:
-# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 16-byte copy should use single 128-bit SIMD load/store (q register)
-# CHECK-ASM-LABEL: <test_16_byte_direct>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 32-byte copy should use two 128-bit SIMD operations
-# CHECK-ASM-LABEL: <test_32_byte_direct>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
-# CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20]
-# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24]
-# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
-# CHECK-ASM-LABEL: <test_128_byte_too_large>:
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-# CHECK-ASM-NOT: ldr{{.*}}q{{[0-9]+}}
-
-	.text
-	.globl	test_1_byte_direct                
-	.type	test_1_byte_direct,@function
-test_1_byte_direct:                              
-	stp	x29, x30, [sp, #-32]!           
-	mov	x29, sp
-	add	x1, sp, #16
-	add	x0, sp, #8  
-	mov	x2, #1
-	bl	memcpy
-	ldp	x29, x30, [sp], #32
-	ret
-	.size	test_1_byte_direct, .-test_1_byte_direct
-
-	.globl	test_2_byte_direct                
-	.type	test_2_byte_direct,@function
-test_2_byte_direct:                              
-	stp	x29, x30, [sp, #-32]!           
-	mov	x29, sp
-	add	x1, sp, #16
-	add	x0, sp, #8  
-	mov	x2, #2
-	bl	memcpy
-	ldp	x29, x30, [sp], #32
-	ret
-	.size	test_2_byte_direct, .-test_2_byte_direct
-
-	.globl	test_4_byte_direct                
-	.type	test_4_byte_direct,@function
-test_4_byte_direct:                              
-	stp	x29, x30, [sp, #-32]!           
-	mov	x29, sp
-	add	x1, sp, #16
-	add	x0, sp, #8  
-	mov	x2, #4
-	bl	memcpy
-	ldp	x29, x30, [sp], #32
-	ret
-	.size	test_4_byte_direct, .-test_4_byte_direct
-
-	.globl	test_8_byte_direct                
-	.type	test_8_byte_direct,@function
-test_8_byte_direct:                              
-	stp	x29, x30, [sp, #-32]!           
-	mov	x29, sp
-	add	x1, sp, #16
-	add	x0, sp, #8  
-	mov	x2, #8
-	bl	memcpy
-	ldp	x29, x30, [sp], #32
-	ret
-	.size	test_8_byte_direct, .-test_8_byte_direct
-
-	.globl	test_16_byte_direct
-	.type	test_16_byte_direct,@function
-test_16_byte_direct:
-	stp	x29, x30, [sp, #-48]!
-	mov	x29, sp
-	add	x1, sp, #16
-	add	x0, sp, #32
-	mov	x2, #16
-	bl	memcpy
-	ldp	x29, x30, [sp], #48
-	ret
-	.size	test_16_byte_direct, .-test_16_byte_direct
-
-	.globl	test_32_byte_direct
-	.type	test_32_byte_direct,@function
-test_32_byte_direct:
-	stp	x29, x30, [sp, #-80]!
-	mov	x29, sp
-	add	x1, sp, #16  
-	add	x0, sp, #48
-	mov	x2, #32
-	bl	memcpy
-	ldp	x29, x30, [sp], #80
-	ret
-	.size	test_32_byte_direct, .-test_32_byte_direct
-
-	.globl	test_37_byte_arbitrary
-	.type	test_37_byte_arbitrary,@function
-test_37_byte_arbitrary:
-	stp	x29, x30, [sp, #-96]!
-	mov	x29, sp
-	add	x1, sp, #16  
-	add	x0, sp, #56
-	mov	x2, #37
-	bl	memcpy
-	ldp	x29, x30, [sp], #96
-	ret
-	.size	test_37_byte_arbitrary, .-test_37_byte_arbitrary
-
-	.globl	test_128_byte_too_large
-	.type	test_128_byte_too_large,@function
-test_128_byte_too_large:
-	stp	x29, x30, [sp, #-288]!
-	mov	x29, sp
-	add	x1, sp, #16  
-	add	x0, sp, #152
-	mov	x2, #128
-	bl	memcpy
-	ldp	x29, x30, [sp], #288
-	ret
-	.size	test_128_byte_too_large, .-test_128_byte_too_large
-
-	.globl	main
-	.type	main,@function
-main:
-	stp	x29, x30, [sp, #-16]!
-	mov	x29, sp
-	
-	bl	test_1_byte_direct
-	bl	test_2_byte_direct
-	bl	test_4_byte_direct
-	bl	test_8_byte_direct
-	bl	test_16_byte_direct  
-	bl	test_32_byte_direct
-	bl	test_37_byte_arbitrary
-	bl	test_128_byte_too_large
-	
-	mov	w0, #0
-	ldp	x29, x30, [sp], #16
-	ret
-	.size	main, .-main

From e83126edd3dd418086f8341a92609210ba7cb874 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 22 Aug 2025 08:51:18 -0700
Subject: [PATCH 07/17] response to review

---
 bolt/lib/Passes/BinaryPasses.cpp              |  37 +++--
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 140 ++++--------------
 2 files changed, 49 insertions(+), 128 deletions(-)

diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 0068c1ad0bf1c..e532c2aa0422d 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1866,26 +1866,25 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
         const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
         const bool IsTailCall = BC.MIB->isTailCall(Inst);
 
-        // Extract the size of the copy from preceding instructions by looking
-        // for writes to the size register
+        // Extract size from preceding instructions (AArch64 only)
+        // Pattern: MOV X2, #num-bytes; BL memcpy (X0=dest, X1=src, X2=size)
         std::optional<uint64_t> KnownSize = std::nullopt;
-        BitVector WrittenRegs(BC.MRI->getNumRegs());
-
-        // Get the size register (3rd arg register, index 2 for AArch64)
-        MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
-
-        // Look backwards through the basic block for a size-setting instruction
-        for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
-          MCInst &Inst = *InstIt;
-          WrittenRegs.reset(); // Clear and check what the instruction writes to
-          BC.MIB->getWrittenRegs(Inst, WrittenRegs);
-
-          // Check for writes to the size register
-          if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) {
-            if (std::optional<uint64_t> ExtractedSize =
-                    BC.MIB->extractMoveImmediate(Inst, SizeReg)) {
-              KnownSize = *ExtractedSize;
-              break;
+        if (BC.isAArch64()) {
+          BitVector WrittenRegs(BC.MRI->getNumRegs());
+          MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
+
+          // Look backwards for size-setting instruction
+          for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
+            MCInst &Inst = *InstIt;
+            WrittenRegs.reset();
+            BC.MIB->getWrittenRegs(Inst, WrittenRegs);
+
+            if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) {
+              if (std::optional<uint64_t> ExtractedSize =
+                      BC.MIB->extractMoveImmediate(Inst, SizeReg)) {
+                KnownSize = *ExtractedSize;
+                break;
+              }
             }
           }
         }
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 03f62117ea096..e640044ec762d 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2647,152 +2647,74 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
 
   InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
                                                  uint64_t Size) const {
+    // Helper to add load/store pair
+    auto addLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
+                                unsigned Reg, unsigned Offset = 0) {
+      Code.emplace_back(MCInstBuilder(LoadOpc)
+                            .addReg(Reg)
+                            .addReg(AArch64::X1)
+                            .addImm(Offset));
+      Code.emplace_back(MCInstBuilder(StoreOpc)
+                            .addReg(Reg)
+                            .addReg(AArch64::X0)
+                            .addImm(Offset));
+    };
+
     // Generate optimal instruction sequences based on exact size
     switch (Size) {
     case 1:
-      // Single byte copy
-      Code.emplace_back(MCInstBuilder(AArch64::LDRBBui)
-                            .addReg(AArch64::W3)
-                            .addReg(AArch64::X1)
-                            .addImm(0));
-      Code.emplace_back(MCInstBuilder(AArch64::STRBBui)
-                            .addReg(AArch64::W3)
-                            .addReg(AArch64::X0)
-                            .addImm(0));
+      addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
       break;
-
     case 2:
-      // 2-byte copy using 16-bit load/store
-      Code.emplace_back(MCInstBuilder(AArch64::LDRHHui)
-                            .addReg(AArch64::W3)
-                            .addReg(AArch64::X1)
-                            .addImm(0));
-      Code.emplace_back(MCInstBuilder(AArch64::STRHHui)
-                            .addReg(AArch64::W3)
-                            .addReg(AArch64::X0)
-                            .addImm(0));
+      addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3);
       break;
-
     case 4:
-      // 4-byte copy using 32-bit load/store
-      Code.emplace_back(MCInstBuilder(AArch64::LDRWui)
-                            .addReg(AArch64::W3)
-                            .addReg(AArch64::X1)
-                            .addImm(0));
-      Code.emplace_back(MCInstBuilder(AArch64::STRWui)
-                            .addReg(AArch64::W3)
-                            .addReg(AArch64::X0)
-                            .addImm(0));
+      addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3);
       break;
-
     case 8:
-      // 8-byte copy using 64-bit load/store
-      Code.emplace_back(MCInstBuilder(AArch64::LDRXui)
-                            .addReg(AArch64::X3)
-                            .addReg(AArch64::X1)
-                            .addImm(0));
-      Code.emplace_back(MCInstBuilder(AArch64::STRXui)
-                            .addReg(AArch64::X3)
-                            .addReg(AArch64::X0)
-                            .addImm(0));
+      addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3);
       break;
-
     case 16:
-      // 16-byte copy using 128-bit SIMD
-      Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
-                            .addReg(AArch64::Q0)
-                            .addReg(AArch64::X1)
-                            .addImm(0));
-      Code.emplace_back(MCInstBuilder(AArch64::STRQui)
-                            .addReg(AArch64::Q0)
-                            .addReg(AArch64::X0)
-                            .addImm(0));
+      addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0);
       break;
-
     case 32:
-      // 32-byte copy using two 128-bit SIMD operations
-      Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
-                            .addReg(AArch64::Q0)
-                            .addReg(AArch64::X1)
-                            .addImm(0));
-      Code.emplace_back(MCInstBuilder(AArch64::STRQui)
-                            .addReg(AArch64::Q0)
-                            .addReg(AArch64::X0)
-                            .addImm(0));
-      Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
-                            .addReg(AArch64::Q1)
-                            .addReg(AArch64::X1)
-                            .addImm(1));
-      Code.emplace_back(MCInstBuilder(AArch64::STRQui)
-                            .addReg(AArch64::Q1)
-                            .addReg(AArch64::X0)
-                            .addImm(1));
+      addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0);
+      addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1);
       break;
 
     default:
       if (Size <= 64) {
-        // For sizes up to 64 bytes, greedily use the largest possible loads in
-        // descending order
+        // For sizes up to 64 bytes, greedily use the largest possible loads
         uint64_t Remaining = Size;
         uint64_t Offset = 0;
 
         while (Remaining >= 16) {
-          Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
-                                .addReg(AArch64::Q0)
-                                .addReg(AArch64::X1)
-                                .addImm(Offset / 16));
-          Code.emplace_back(MCInstBuilder(AArch64::STRQui)
-                                .addReg(AArch64::Q0)
-                                .addReg(AArch64::X0)
-                                .addImm(Offset / 16));
+          addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0,
+                           Offset / 16);
           Remaining -= 16;
           Offset += 16;
         }
         if (Remaining >= 8) {
-          Code.emplace_back(MCInstBuilder(AArch64::LDRXui)
-                                .addReg(AArch64::X3)
-                                .addReg(AArch64::X1)
-                                .addImm(Offset / 8));
-          Code.emplace_back(MCInstBuilder(AArch64::STRXui)
-                                .addReg(AArch64::X3)
-                                .addReg(AArch64::X0)
-                                .addImm(Offset / 8));
+          addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3,
+                           Offset / 8);
           Remaining -= 8;
           Offset += 8;
         }
         if (Remaining >= 4) {
-          Code.emplace_back(MCInstBuilder(AArch64::LDRWui)
-                                .addReg(AArch64::W3)
-                                .addReg(AArch64::X1)
-                                .addImm(Offset / 4));
-          Code.emplace_back(MCInstBuilder(AArch64::STRWui)
-                                .addReg(AArch64::W3)
-                                .addReg(AArch64::X0)
-                                .addImm(Offset / 4));
+          addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3,
+                           Offset / 4);
           Remaining -= 4;
           Offset += 4;
         }
         if (Remaining >= 2) {
-          Code.emplace_back(MCInstBuilder(AArch64::LDRHHui)
-                                .addReg(AArch64::W3)
-                                .addReg(AArch64::X1)
-                                .addImm(Offset / 2));
-          Code.emplace_back(MCInstBuilder(AArch64::STRHHui)
-                                .addReg(AArch64::W3)
-                                .addReg(AArch64::X0)
-                                .addImm(Offset / 2));
+          addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3,
+                           Offset / 2);
           Remaining -= 2;
           Offset += 2;
         }
         if (Remaining == 1) {
-          Code.emplace_back(MCInstBuilder(AArch64::LDRBBui)
-                                .addReg(AArch64::W3)
-                                .addReg(AArch64::X1)
-                                .addImm(Offset));
-          Code.emplace_back(MCInstBuilder(AArch64::STRBBui)
-                                .addReg(AArch64::W3)
-                                .addReg(AArch64::X0)
-                                .addImm(Offset));
+          addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3,
+                           Offset);
         }
       } else {
         Code.clear();
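
One subtlety the new helper keeps intact: the unsigned-offset load/store forms
scale their immediate by the access width, which is why the call sites divide
Offset by 16/8/4/2. For the 32-byte case, for example,

  addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0);
  addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1);

encodes byte offsets 0 and 16 (imm 1 = one 16-byte element), matching the
"[x1, #0x10]" operands the test expects.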

From cf8279a8b5081eec657a1f835c54470653186787 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Wed, 27 Aug 2025 03:57:43 -0700
Subject: [PATCH 08/17] Update conditional formatting and move check for size
 into BinaryPasses

---
 bolt/lib/Passes/BinaryPasses.cpp                 |  5 +++++
 bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 13 ++++---------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index e532c2aa0422d..1aade44286052 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1889,6 +1889,11 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
           }
         }
 
+        if (BC.isAArch64() && !KnownSize.has_value()) {
+          ++II;
+          continue;
+        }
+
         const InstructionListType NewCode =
             BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);
         II = BB.replaceInstruction(II, NewCode);
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index e640044ec762d..9d30fdface0c5 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2621,24 +2621,19 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
                      std::optional<uint64_t> KnownSize) const override {
     InstructionListType Code;
     if (ReturnEnd) {
-      if (KnownSize.has_value() && (*KnownSize >> 12) == 0) {
-        // Use immediate if size is known and fits in 12-bit immediate (0-4095)
+      // Use immediate if size fits in 12-bit immediate (0-4095)
+      // Otherwise, fall back to register add for large sizes
+      if ((*KnownSize >> 12) == 0)
         Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
                               .addReg(AArch64::X0)
                               .addReg(AArch64::X0)
                               .addImm(*KnownSize)
                               .addImm(0));
-      } else {
-        // Fall back to register add for unknown or large sizes
+      else
         Code.emplace_back(MCInstBuilder(AArch64::ADDXrr)
                               .addReg(AArch64::X0)
                               .addReg(AArch64::X0)
                               .addReg(AArch64::X2));
-      }
-    }
-
-    if (!KnownSize.has_value()) {
-      return Code;
     }
 
     uint64_t Size = *KnownSize;
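
With the std::nullopt guard hoisted into BinaryPasses, the AArch64 builder can
dereference *KnownSize unconditionally on this path. The remaining
immediate-vs-register split is just a range check against ADDXri's 12-bit
unsigned immediate field:

  // Size = 4095: (4095 >> 12) == 0 -> add x0, x0, #4095  (ADDXri)
  // Size = 4096: (4096 >> 12) == 1 -> add x0, x0, x2     (ADDXrr)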

From c317eb0cbd62ac6f164cf44b75d40e082167ce3d Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Wed, 27 Aug 2025 04:55:17 -0700
Subject: [PATCH 09/17] Negative Tests (live-in, register move, non-mov
 instruction)

---
 bolt/test/runtime/AArch64/inline-memcpy.s | 61 ++++++++++++++++++++++-
 1 file changed, 60 insertions(+), 1 deletion(-)

diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 0e16b6a7e963f..417b444f6a4bb 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,7 +7,7 @@
 # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
 # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
 
-# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed)
+# Verify BOLT reports that it inlined memcpy calls (8 successful inlines out of 11 total calls)
 # CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
 
 # Each function should use optimal size-specific instructions and NO memcpy calls
@@ -67,6 +67,18 @@
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 # CHECK-ASM-NOT: ldr{{.*}}q{{[0-9]+}}
 
+# ADD immediate with non-zero source should NOT be inlined (can't track mov+add chain)
+# CHECK-ASM-LABEL: <test_4_byte_add_immediate>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
+# Register move should NOT be inlined (size unknown at compile time)
+# CHECK-ASM-LABEL: <test_register_move_negative>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
+# Live-in parameter should NOT be inlined (size unknown at compile time)
+# CHECK-ASM-LABEL: <test_live_in_negative>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
 	.text
 	.globl	test_1_byte_direct                
 	.type	test_1_byte_direct,@function
@@ -172,6 +184,50 @@ test_128_byte_too_large:
 	ret
 	.size	test_128_byte_too_large, .-test_128_byte_too_large
 
+	.globl	test_4_byte_add_immediate
+	.type	test_4_byte_add_immediate,@function
+test_4_byte_add_immediate:
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8
+	mov	x3, #0
+	add	x2, x3, #4
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_4_byte_add_immediate, .-test_4_byte_add_immediate
+
+	.globl	test_register_move_negative
+	.type	test_register_move_negative,@function
+test_register_move_negative:
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8
+	mov	x6, #4
+	mov	x2, x6
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_register_move_negative, .-test_register_move_negative
+
+	.globl	test_live_in_negative
+	.type	test_live_in_negative,@function
+test_live_in_negative:
+	# x2 comes in as parameter, no instruction sets it (should NOT inline)
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8
+	# x2 is live-in, no size-setting instruction
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_live_in_negative, .-test_live_in_negative
+
+
+
 	.globl	main
 	.type	main,@function
 main:
@@ -186,6 +242,9 @@ main:
 	bl	test_32_byte_direct
 	bl	test_37_byte_arbitrary
 	bl	test_128_byte_too_large
+	bl	test_4_byte_add_immediate
+	bl	test_register_move_negative
+	bl	test_live_in_negative
 	
 	mov	w0, #0
 	ldp	x29, x30, [sp], #16
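
All three negative cases fail the same narrow pattern match: the backward scan
finds the last write to X2 (or, for the live-in case, none at all), but
extractMoveImmediate only accepts MOVZXi. Sketched in terms of that hook
(opcode naming per LLVM's AArch64 backend, where a register move such as
"mov x2, x6" assembles to an ORR alias rather than MOVZXi):

  // add x2, x3, #4 -> ADDXri, not MOVZXi -> nullopt -> call kept
  // mov x2, x6     -> ORR-class alias    -> nullopt -> call kept
  // live-in x2     -> no write found     -> size unknown -> call kept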

From df97d61befcc9ceaf3d82648a1b68b88cc3e0451 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Wed, 27 Aug 2025 06:51:08 -0700
Subject: [PATCH 10/17] memcpy8 redundant handling removed

---
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 29 ++++++---------
 bolt/test/runtime/AArch64/inline-memcpy.s     | 37 ++++++++++++++++++-
 2 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 9d30fdface0c5..366d4183bca51 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2620,24 +2620,19 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
   createInlineMemcpy(bool ReturnEnd,
                      std::optional<uint64_t> KnownSize) const override {
     InstructionListType Code;
-    if (ReturnEnd) {
-      // Use immediate if size fits in 12-bit immediate (0-4095)
-      // Otherwise, fall back to register add for large sizes
-      if ((*KnownSize >> 12) == 0)
-        Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
-                              .addReg(AArch64::X0)
-                              .addReg(AArch64::X0)
-                              .addImm(*KnownSize)
-                              .addImm(0));
-      else
-        Code.emplace_back(MCInstBuilder(AArch64::ADDXrr)
-                              .addReg(AArch64::X0)
-                              .addReg(AArch64::X0)
-                              .addReg(AArch64::X2));
-    }
-
     uint64_t Size = *KnownSize;
-    return generateSizeSpecificMemcpy(Code, Size);
+
+    // Generate the optimized memcpy sequence
+    generateSizeSpecificMemcpy(Code, Size);
+
+    // If _memcpy8, adjust X0 to return dest+size instead of dest
+    if (ReturnEnd)
+      Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
+                            .addReg(AArch64::X0)
+                            .addReg(AArch64::X0)
+                            .addImm(Size)
+                            .addImm(0));
+    return Code;
   }
 
   InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 417b444f6a4bb..961e21f82851d 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,8 +7,8 @@
 # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
 # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
 
-# Verify BOLT reports that it inlined memcpy calls (8 successful inlines out of 11 total calls)
-# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
+# Verify BOLT reports that it inlined memcpy calls (9 successful inlines out of 12 total calls)
+# CHECK-INLINE: BOLT-INFO: inlined 9 memcpy() calls
 
 # Each function should use optimal size-specific instructions and NO memcpy calls
 
@@ -79,6 +79,13 @@
 # CHECK-ASM-LABEL: <test_live_in_negative>:
 # CHECK-ASM: bl{{.*}}<memcpy
 
+# _memcpy8 should be inlined with end-pointer return (dest+size)
+# CHECK-ASM-LABEL: <test_memcpy8_4_byte>:
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: add{{.*}}x0, x0, #0x4
+# CHECK-ASM-NOT: bl{{.*}}<_memcpy8
+
 	.text
 	.globl	test_1_byte_direct                
 	.type	test_1_byte_direct, at function
@@ -226,7 +233,31 @@ test_live_in_negative:
 	ret
 	.size	test_live_in_negative, .-test_live_in_negative
 
+	.globl	test_memcpy8_4_byte
+	.type	test_memcpy8_4_byte,@function
+test_memcpy8_4_byte:
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8
+	mov	x2, #4
+	bl	_memcpy8
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_memcpy8_4_byte, .-test_memcpy8_4_byte
 
+	# Simple _memcpy8 implementation that calls memcpy and returns dest+size
+	.globl	_memcpy8
+	.type	_memcpy8,@function
+_memcpy8:
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
+	mov	x3, x0
+	bl	memcpy
+	add	x0, x3, x2
+	ldp	x29, x30, [sp], #16
+	ret
+	.size	_memcpy8, .-_memcpy8
 
 	.globl	main
 	.type	main,@function
@@ -245,6 +276,8 @@ main:
 	bl	test_4_byte_add_immediate
 	bl	test_register_move_negative
 	bl	test_live_in_negative
+	bl	test_memcpy8_4_byte
+	bl	test_memcpy8_large_size
 	
 	mov	w0, #0
 	ldp	x29, x30, [sp], #16
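
The reordering matters for the _memcpy8 (ReturnEnd) contract: the ADDXri that
materializes dest + Size is now appended after the copy, so X0 still points at
the destination while the loads and stores execute. For the 4-byte case the
expected shape, per the CHECK lines above, is informally:

  // ldr w3, [x1]; str w3, [x0]  -- the copy, with X0 still = dest
  // add x0, x0, #4              -- then X0 = dest + size, _memcpy8's return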

From 25cfb58b165fd1190f9b1b52cce1423d2db5d3c1 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Wed, 27 Aug 2025 06:54:14 -0700
Subject: [PATCH 11/17] nit: comment clean up

---
 bolt/lib/Passes/BinaryPasses.cpp                 | 6 +++---
 bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 9 ++++-----
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 1aade44286052..e8124dd3cb4f4 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1866,14 +1866,14 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
         const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
         const bool IsTailCall = BC.MIB->isTailCall(Inst);
 
-        // Extract size from preceding instructions (AArch64 only)
-        // Pattern: MOV X2, #num-bytes; BL memcpy (X0=dest, X1=src, X2=size)
+        // Extract size from preceding instructions (AArch64 only).
+        // Pattern: MOV X2, #num-bytes; BL memcpy (X0=dest, X1=src, X2=size).
         std::optional<uint64_t> KnownSize = std::nullopt;
         if (BC.isAArch64()) {
           BitVector WrittenRegs(BC.MRI->getNumRegs());
           MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
 
-          // Look backwards for size-setting instruction
+          // Look backwards for size-setting instruction.
           for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
             MCInst &Inst = *InstIt;
             WrittenRegs.reset();
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 366d4183bca51..67febc2324e14 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2622,10 +2622,10 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     InstructionListType Code;
     uint64_t Size = *KnownSize;
 
-    // Generate the optimized memcpy sequence
+    // Generate the optimized memcpy sequence.
     generateSizeSpecificMemcpy(Code, Size);
 
-    // If _memcpy8, adjust X0 to return dest+size instead of dest
+    // If _memcpy8, adjust X0 to return dest+size instead of dest.
     if (ReturnEnd)
       Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
                             .addReg(AArch64::X0)
@@ -2637,7 +2637,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
 
   InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
                                                  uint64_t Size) const {
-    // Helper to add load/store pair
     auto addLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
                                 unsigned Reg, unsigned Offset = 0) {
       Code.emplace_back(MCInstBuilder(LoadOpc)
@@ -2650,7 +2649,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
                             .addImm(Offset));
     };
 
-    // Generate optimal instruction sequences based on exact size
+    // Generate optimal instruction sequences based on exact size.
     switch (Size) {
     case 1:
       addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
@@ -2674,7 +2673,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
 
     default:
       if (Size <= 64) {
-        // For sizes up to 64 bytes, greedily use the largest possible loads
+        // For sizes up to 64 bytes, greedily use the largest possible loads.
         uint64_t Remaining = Size;
         uint64_t Offset = 0;
 

>From e308855758965504cca82484f66065d186c64093 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 02:12:27 -0700
Subject: [PATCH 12/17] minor refactor

---
 bolt/lib/Passes/BinaryPasses.cpp              | 11 +++++-----
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 22 +++++++------------
 2 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index e8124dd3cb4f4..022d06ae80e7b 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1872,6 +1872,7 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
         if (BC.isAArch64()) {
           BitVector WrittenRegs(BC.MRI->getNumRegs());
           MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
+          std::optional<uint64_t> ExtractedSize;
 
           // Look backwards for size-setting instruction.
           for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
@@ -1879,12 +1880,10 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
             WrittenRegs.reset();
             BC.MIB->getWrittenRegs(Inst, WrittenRegs);
 
-            if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) {
-              if (std::optional<uint64_t> ExtractedSize =
-                      BC.MIB->extractMoveImmediate(Inst, SizeReg)) {
-                KnownSize = *ExtractedSize;
-                break;
-              }
+            if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg] &&
+                (ExtractedSize = BC.MIB->extractMoveImmediate(Inst, SizeReg))) {
+              KnownSize = *ExtractedSize;
+              break;
             }
           }
         }
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 67febc2324e14..dfb5fe3cfe30d 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2599,20 +2599,17 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
   }
 
   InstructionListType createInlineMemcpy(bool ReturnEnd) const override {
-    // Fallback
     return createInlineMemcpy(ReturnEnd, std::nullopt);
   }
 
   std::optional<uint64_t>
   extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
-    if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3) {
-      if (Inst.getOperand(0).isReg() &&
-          Inst.getOperand(0).getReg() == TargetReg &&
-          Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() &&
-          Inst.getOperand(2).getImm() == 0) {
-        return Inst.getOperand(1).getImm();
-      }
-    }
+    if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3 &&
+        Inst.getOperand(0).isReg() &&
+        Inst.getOperand(0).getReg() == TargetReg &&
+        Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() &&
+        Inst.getOperand(2).getImm() == 0)
+      return Inst.getOperand(1).getImm();
     return std::nullopt;
   }
 
@@ -2622,7 +2619,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     InstructionListType Code;
     uint64_t Size = *KnownSize;
 
-    // Generate the optimized memcpy sequence.
     generateSizeSpecificMemcpy(Code, Size);
 
     // If _memcpy8, adjust X0 to return dest+size instead of dest.
@@ -2701,13 +2697,11 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
           Remaining -= 2;
           Offset += 2;
         }
-        if (Remaining == 1) {
+        if (Remaining == 1)
           addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3,
                            Offset);
-        }
-      } else {
+      } else
         Code.clear();
-      }
       break;
     }
     return Code;

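The immediate matcher only accepts a MOVZ into the size register with a zero shift, i.e. a plain `mov xN, #imm` with imm in [0, 65535]; shifted MOVZ forms are rejected. A toy model of the check, using a simplified record rather than BOLT's real MCInst API:

    #include <cstdint>
    #include <optional>

    // Toy stand-in for MOVZXi: MOVZ Xd, #imm16, LSL #(hw * 16).
    struct MovZ {
      unsigned DstReg; // destination register
      uint64_t Imm16;  // 16-bit immediate payload
      unsigned Shift;  // 0, 16, 32 or 48
    };

    // Mirrors extractMoveImmediate: match only when the destination is the
    // size register and the shift is zero, so the extracted size is always
    // the literal immediate.
    std::optional<uint64_t> extractMoveImmediate(const MovZ &Inst,
                                                 unsigned SizeReg) {
      if (Inst.DstReg == SizeReg && Inst.Shift == 0)
        return Inst.Imm16;
      return std::nullopt;
    }
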
>From 365a0bfaa0d68e9a5c45f9b5163af49ca6d5c1b8 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 06:33:39 -0700
Subject: [PATCH 13/17] NFC: Post-review refactor

---
 bolt/include/bolt/Core/MCPlusBuilder.h        | 10 +++
 bolt/lib/Passes/BinaryPasses.cpp              | 21 +----
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 81 ++++++++++---------
 3 files changed, 55 insertions(+), 57 deletions(-)

diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index 6cbf288f3b8f4..3192472f5fbe0 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -14,6 +14,7 @@
 #ifndef BOLT_CORE_MCPLUSBUILDER_H
 #define BOLT_CORE_MCPLUSBUILDER_H
 
+#include "bolt/Core/BinaryBasicBlock.h"
 #include "bolt/Core/MCPlus.h"
 #include "bolt/Core/Relocation.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -1888,6 +1889,15 @@ class MCPlusBuilder {
     return {};
   }
 
+  /// Find memcpy size in bytes by using preceding instructions.
+  /// Returns std::nullopt if size cannot be determined (no-op for most
+  /// targets).
+  virtual std::optional<uint64_t>
+  findMemcpySizeInBytes(const BinaryBasicBlock &BB,
+                        BinaryBasicBlock::iterator CallInst) const {
+    return std::nullopt;
+  }
+
   /// Creates inline memcpy instruction. If \p ReturnEnd is true, then return
   /// (dest + n) instead of dest.
   virtual InstructionListType createInlineMemcpy(bool ReturnEnd) const {
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 022d06ae80e7b..f1807f6eb997e 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1868,25 +1868,8 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
 
         // Extract size from preceding instructions (AArch64 only).
         // Pattern: MOV X2, #nb-bytes; BL memcpy src, dest, X2.
-        std::optional<uint64_t> KnownSize = std::nullopt;
-        if (BC.isAArch64()) {
-          BitVector WrittenRegs(BC.MRI->getNumRegs());
-          MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
-          std::optional<uint64_t> ExtractedSize;
-
-          // Look backwards for size-setting instruction.
-          for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
-            MCInst &Inst = *InstIt;
-            WrittenRegs.reset();
-            BC.MIB->getWrittenRegs(Inst, WrittenRegs);
-
-            if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg] &&
-                (ExtractedSize = BC.MIB->extractMoveImmediate(Inst, SizeReg))) {
-              KnownSize = *ExtractedSize;
-              break;
-            }
-          }
-        }
+        std::optional<uint64_t> KnownSize =
+            BC.MIB->findMemcpySizeInBytes(BB, II);
 
         if (BC.isAArch64() && !KnownSize.has_value()) {
           ++II;
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index dfb5fe3cfe30d..6f539b8588f2e 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2604,15 +2604,33 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
 
   std::optional<uint64_t>
   extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
-    if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3 &&
-        Inst.getOperand(0).isReg() &&
+    // Match MOVZXi with the target register and no shift.
+    if (Inst.getOpcode() == AArch64::MOVZXi &&
         Inst.getOperand(0).getReg() == TargetReg &&
-        Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() &&
         Inst.getOperand(2).getImm() == 0)
       return Inst.getOperand(1).getImm();
     return std::nullopt;
   }
 
+  std::optional<uint64_t>
+  findMemcpySizeInBytes(const BinaryBasicBlock &BB,
+                        BinaryBasicBlock::iterator CallInst) const override {
+    BitVector WrittenRegs(RegInfo->getNumRegs());
+    MCPhysReg SizeReg = getIntArgRegister(2);
+    std::optional<uint64_t> ExtractedSize;
+
+    for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) {
+      const MCInst &Inst = *InstIt;
+      WrittenRegs.reset();
+      getWrittenRegs(Inst, WrittenRegs);
+
+      if (SizeReg != getNoRegister() && WrittenRegs[SizeReg] &&
+          (ExtractedSize = extractMoveImmediate(Inst, SizeReg)))
+        return *ExtractedSize;
+    }
+    return std::nullopt;
+  }
+
   InstructionListType
   createInlineMemcpy(bool ReturnEnd,
                      std::optional<uint64_t> KnownSize) const override {
@@ -2633,7 +2651,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
 
   InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
                                                  uint64_t Size) const {
-    auto addLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
+    auto AddLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
                                 unsigned Reg, unsigned Offset = 0) {
       Code.emplace_back(MCInstBuilder(LoadOpc)
                             .addReg(Reg)
@@ -2648,23 +2666,23 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     // Generate optimal instruction sequences based on exact size.
     switch (Size) {
     case 1:
-      addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
+      AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
       break;
     case 2:
-      addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3);
+      AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3);
       break;
     case 4:
-      addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3);
+      AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3);
       break;
     case 8:
-      addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3);
+      AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3);
       break;
     case 16:
-      addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0);
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0);
       break;
     case 32:
-      addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0);
-      addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1);
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0);
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1);
       break;
 
     default:
@@ -2673,33 +2691,20 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
         uint64_t Remaining = Size;
         uint64_t Offset = 0;
 
-        while (Remaining >= 16) {
-          addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0,
-                           Offset / 16);
-          Remaining -= 16;
-          Offset += 16;
-        }
-        if (Remaining >= 8) {
-          addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3,
-                           Offset / 8);
-          Remaining -= 8;
-          Offset += 8;
-        }
-        if (Remaining >= 4) {
-          addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3,
-                           Offset / 4);
-          Remaining -= 4;
-          Offset += 4;
-        }
-        if (Remaining >= 2) {
-          addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3,
-                           Offset / 2);
-          Remaining -= 2;
-          Offset += 2;
-        }
-        if (Remaining == 1)
-          addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3,
-                           Offset);
+        const std::array<std::tuple<uint64_t, unsigned, unsigned, unsigned>, 5>
+            LoadStoreOps = {
+                {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q0},
+                 {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X3},
+                 {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W3},
+                 {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3},
+                 {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3}}};
+
+        for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
+          while (Remaining >= OpSize) {
+            AddLoadStorePair(LoadOp, StoreOp, TempReg, Offset / OpSize);
+            Remaining -= OpSize;
+            Offset += OpSize;
+          }
       } else
         Code.clear();
       break;

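The table-driven fallback decomposes any size up to 64 bytes greedily into 16/8/4/2/1-byte chunks. A standalone sketch of the same arithmetic, outside BOLT:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Greedy decomposition used for sizes <= 64: take as many 16-byte
    // chunks as fit, then 8, 4, 2, and finally 1.
    std::vector<uint64_t> decompose(uint64_t Size) {
      std::vector<uint64_t> Chunks;
      for (uint64_t OpSize : {16, 8, 4, 2, 1})
        while (Size >= OpSize) {
          Chunks.push_back(OpSize);
          Size -= OpSize;
        }
      return Chunks;
    }

    int main() {
      // Prints "16 16 4 1", matching the q/q/w/b sequence the 37-byte test
      // checks for. decompose(0) is empty, so a 0-byte memcpy call is
      // simply deleted.
      for (uint64_t Chunk : decompose(37))
        std::cout << Chunk << ' ';
      std::cout << '\n';
    }
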
>From 84c904ac68b263b48227b3308ad16c795382b7c3 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 06:42:47 -0700
Subject: [PATCH 14/17] NFC: Test for corner case with size 0

---
 bolt/test/runtime/AArch64/inline-memcpy.s | 25 ++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 961e21f82851d..3acb5e394d52d 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,8 +7,8 @@
 # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
 # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
 
-# Verify BOLT reports that it inlined memcpy calls (9 successful inlines out of 12 total calls)
-# CHECK-INLINE: BOLT-INFO: inlined 9 memcpy() calls
+# Verify BOLT reports that it inlined memcpy calls (10 successful inlines out of 13 total calls)
+# CHECK-INLINE: BOLT-INFO: inlined 10 memcpy() calls
 
 # Each function should use optimal size-specific instructions and NO memcpy calls
 
@@ -62,6 +62,12 @@
 # CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
+# 0-byte copy should be inlined with no load/store instructions (nothing to copy)
+# CHECK-ASM-LABEL: <test_0_byte>:
+# CHECK-ASM-NOT: ldr
+# CHECK-ASM-NOT: str  
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
 # 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
 # CHECK-ASM-LABEL: <test_128_byte_too_large>:
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
@@ -178,6 +184,19 @@ test_37_byte_arbitrary:
 	ret
 	.size	test_37_byte_arbitrary, .-test_37_byte_arbitrary
 
+	.globl	test_0_byte
+	.type	test_0_byte,@function
+test_0_byte:
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8
+	mov	x2, #0
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_0_byte, .-test_0_byte
+
 	.globl	test_128_byte_too_large
 	.type	test_128_byte_too_large,@function
 test_128_byte_too_large:
@@ -272,12 +291,12 @@ main:
 	bl	test_16_byte_direct  
 	bl	test_32_byte_direct
 	bl	test_37_byte_arbitrary
+	bl	test_0_byte
 	bl	test_128_byte_too_large
 	bl	test_4_byte_add_immediate
 	bl	test_register_move_negative
 	bl	test_live_in_negative
 	bl	test_memcpy8_4_byte
-	bl	test_memcpy8_large_size
 	
 	mov	w0, #0
 	ldp	x29, x30, [sp], #16

>From 0561bccf755709811eed3d13e10bdcd2afa5fbe3 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 10:01:21 -0700
Subject: [PATCH 15/17] Use temp instead of argument registers

---
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 6f539b8588f2e..f17a91bc3ba76 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2666,23 +2666,23 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     // Generate optimal instruction sequences based on exact size.
     switch (Size) {
     case 1:
-      AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
+      AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9);
       break;
     case 2:
-      AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3);
+      AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9);
       break;
     case 4:
-      AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3);
+      AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W9);
       break;
     case 8:
-      AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3);
+      AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X9);
       break;
     case 16:
-      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0);
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16);
       break;
     case 32:
-      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0);
-      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1);
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16, 0);
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q17, 1);
       break;
 
     default:
@@ -2693,11 +2693,11 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
 
         const std::array<std::tuple<uint64_t, unsigned, unsigned, unsigned>, 5>
             LoadStoreOps = {
-                {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q0},
-                 {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X3},
-                 {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W3},
-                 {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3},
-                 {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3}}};
+                {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16},
+                 {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9},
+                 {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9},
+                 {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9},
+                 {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}}};
 
         for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
           while (Remaining >= OpSize) {

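On the register switch: under AAPCS64, X9 is in the temporary range (X9-X15) and Q16/Q17 are in the temporary vector range (V16-V31), so the scratch registers written by the inlined sequence are neither argument registers nor callee-saved. A small runnable sketch of the post-patch size-to-register mapping (register names are plain strings here, not MC-layer handles):

    #include <cstdint>
    #include <iostream>

    int main() {
      // Scratch register per chunk size after this patch. The dedicated
      // 32-byte case uses q16 and q17; the greedy fallback reuses q16 for
      // every 16-byte chunk (the 37-byte test expects q16 at both offsets).
      const struct { uint64_t Size; const char *Reg; } Ops[] = {
          {16, "q16"}, {8, "x9"}, {4, "w9"}, {2, "w9"}, {1, "w9"}};

      uint64_t Remaining = 37, Offset = 0;
      for (const auto &Op : Ops)
        while (Remaining >= Op.Size) {
          std::cout << "copy " << Op.Size << " bytes at +" << Offset
                    << " via " << Op.Reg << '\n';
          Remaining -= Op.Size;
          Offset += Op.Size;
        }
    }
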
>From cc49db79eea544305571e5e91caa3328c91cf4a7 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 10:01:54 -0700
Subject: [PATCH 16/17] Update early return

---
 bolt/lib/Passes/BinaryPasses.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index f1807f6eb997e..d40f5fb78c7f3 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1871,10 +1871,8 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
         std::optional<uint64_t> KnownSize =
             BC.MIB->findMemcpySizeInBytes(BB, II);
 
-        if (BC.isAArch64() && !KnownSize.has_value()) {
-          ++II;
+        if (BC.isAArch64() && !KnownSize.has_value())
           continue;
-        }
 
         const InstructionListType NewCode =
             BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);

>From 115606be208c8b6675df59b9f231dd709ea863fd Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 10:02:48 -0700
Subject: [PATCH 17/17] Update tests to be more specific about registers +
 negative test on early return check

---
 bolt/test/runtime/AArch64/inline-memcpy.s | 70 +++++++++++++++--------
 1 file changed, 45 insertions(+), 25 deletions(-)

diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 3acb5e394d52d..14a95d91dd189 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,59 +7,59 @@
 # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
 # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
 
-# Verify BOLT reports that it inlined memcpy calls (10 successful inlines out of 13 total calls)
+# Verify BOLT reports that it inlined memcpy calls (10 successful inlines out of 14 total calls)
 # CHECK-INLINE: BOLT-INFO: inlined 10 memcpy() calls
 
 # Each function should use optimal size-specific instructions and NO memcpy calls
 
 # 1-byte copy should use single byte load/store (ldrb/strb)
 # CHECK-ASM-LABEL: <test_1_byte_direct>:
-# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: ldrb{{.*}}w9, [x1]
+# CHECK-ASM: strb{{.*}}w9, [x0]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 2-byte copy should use single 16-bit load/store (ldrh/strh)
 # CHECK-ASM-LABEL: <test_2_byte_direct>:
-# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: ldrh{{.*}}w9, [x1]
+# CHECK-ASM: strh{{.*}}w9, [x0]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 4-byte copy should use single 32-bit load/store (w register)
 # CHECK-ASM-LABEL: <test_4_byte_direct>:
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}w9, [x1]
+# CHECK-ASM: str{{.*}}w9, [x0]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 8-byte copy should use single 64-bit load/store (x register)
 # CHECK-ASM-LABEL: <test_8_byte_direct>:
-# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}x9, [x1]
+# CHECK-ASM: str{{.*}}x9, [x0]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 16-byte copy should use single 128-bit SIMD load/store (q register)
 # CHECK-ASM-LABEL: <test_16_byte_direct>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q16, [x1]
+# CHECK-ASM: str{{.*}}q16, [x0]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 32-byte copy should use two 128-bit SIMD operations
 # CHECK-ASM-LABEL: <test_32_byte_direct>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM: ldr{{.*}}q16, [x1]
+# CHECK-ASM: str{{.*}}q16, [x0]
+# CHECK-ASM: ldr{{.*}}q17, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q17, [x0, #0x10]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
 # CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20]
-# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24]
-# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
+# CHECK-ASM: ldr{{.*}}q16, [x1]
+# CHECK-ASM: str{{.*}}q16, [x0]
+# CHECK-ASM: ldr{{.*}}q16, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q16, [x0, #0x10]
+# CHECK-ASM: ldr{{.*}}w9, [x1, #0x20]
+# CHECK-ASM: str{{.*}}w9, [x0, #0x20]
+# CHECK-ASM: ldrb{{.*}}w9, [x1, #0x24]
+# CHECK-ASM: strb{{.*}}w9, [x0, #0x24]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 0-byte copy should be inlined with no load/store instructions (nothing to copy)
@@ -85,10 +85,14 @@
 # CHECK-ASM-LABEL: <test_live_in_negative>:
 # CHECK-ASM: bl{{.*}}<memcpy
 
+# Register-based size should NOT be inlined (on AArch64 the size must be a known immediate at compile time)
+# CHECK-ASM-LABEL: <test_register_size_negative>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
 # _memcpy8 should be inlined with end-pointer return (dest+size)
 # CHECK-ASM-LABEL: <test_memcpy8_4_byte>:
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}w9, [x1]
+# CHECK-ASM: str{{.*}}w9, [x0]
 # CHECK-ASM: add{{.*}}x0, x0, #0x4
 # CHECK-ASM-NOT: bl{{.*}}<_memcpy8
 
@@ -252,6 +256,21 @@ test_live_in_negative:
 	ret
 	.size	test_live_in_negative, .-test_live_in_negative
 
+	.globl	test_register_size_negative
+	.type	test_register_size_negative,@function
+test_register_size_negative:
+	# This would crash without the isAArch64() guard: the size comes from a register, not an immediate
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8
+	mov	x3, #4
+	mov	x2, x3
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_register_size_negative, .-test_register_size_negative
+
 	.globl	test_memcpy8_4_byte
 	.type	test_memcpy8_4_byte,@function
 test_memcpy8_4_byte:
@@ -296,6 +315,7 @@ main:
 	bl	test_4_byte_add_immediate
 	bl	test_register_move_negative
 	bl	test_live_in_negative
+	bl	test_register_size_negative
 	bl	test_memcpy8_4_byte
 	
 	mov	w0, #0


