[llvm] [BOLT][AArch64] Enabling Inlining for Memcpy for AArch64 in BOLT (PR #154929)

via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 5 09:19:52 PDT 2025


https://github.com/yafet-a updated https://github.com/llvm/llvm-project/pull/154929

>From ce56f84aa7c86e1b35cf0ca4218a1f23702a206e Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 21 Aug 2025 10:12:03 -0700
Subject: [PATCH 01/26] pre-commit test

---
 bolt/test/AArch64/inline-memcpy.s | 193 ++++++++++++++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100644 bolt/test/AArch64/inline-memcpy.s

diff --git a/bolt/test/AArch64/inline-memcpy.s b/bolt/test/AArch64/inline-memcpy.s
new file mode 100644
index 0000000000000..3bb498e600fb6
--- /dev/null
+++ b/bolt/test/AArch64/inline-memcpy.s
@@ -0,0 +1,193 @@
+## This test checks that BOLT correctly inlines memcpy calls on AArch64.
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang --target=aarch64-unknown-linux-gnu %t.o -o %t.exe -Wl,-q  
+# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
+# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
+
+# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed)
+# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
+
+# Each function should use optimal size-specific instructions and NO memcpy calls
+
+# 1-byte copy should use single byte load/store (ldrb/strb)
+# CHECK-ASM-LABEL: <test_1_byte_direct>:
+# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 2-byte copy should use single 16-bit load/store (ldrh/strh)
+# CHECK-ASM-LABEL: <test_2_byte_direct>:
+# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 4-byte copy should use single 32-bit load/store (w register)
+# CHECK-ASM-LABEL: <test_4_byte_direct>:
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 8-byte copy should use single 64-bit load/store (x register)
+# CHECK-ASM-LABEL: <test_8_byte_direct>:
+# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 16-byte copy should use single 128-bit SIMD load/store (q register)
+# CHECK-ASM-LABEL: <test_16_byte_direct>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 32-byte copy should use two 128-bit SIMD operations
+# CHECK-ASM-LABEL: <test_32_byte_direct>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
+# CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20]
+# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24]
+# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
+# CHECK-ASM-LABEL: <test_128_byte_too_large>:
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+# CHECK-ASM-NOT: ldr{{.*}}q{{[0-9]+}}
+
+	.text
+	.globl	test_1_byte_direct                
+	.type	test_1_byte_direct,@function
+test_1_byte_direct:                              
+	stp	x29, x30, [sp, #-32]!           
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8  
+	mov	x2, #1
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_1_byte_direct, .-test_1_byte_direct
+
+	.globl	test_2_byte_direct                
+	.type	test_2_byte_direct,@function
+test_2_byte_direct:                              
+	stp	x29, x30, [sp, #-32]!           
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8  
+	mov	x2, #2
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_2_byte_direct, .-test_2_byte_direct
+
+	.globl	test_4_byte_direct                
+	.type	test_4_byte_direct,@function
+test_4_byte_direct:                              
+	stp	x29, x30, [sp, #-32]!           
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8  
+	mov	x2, #4
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_4_byte_direct, .-test_4_byte_direct
+
+	.globl	test_8_byte_direct                
+	.type	test_8_byte_direct,@function
+test_8_byte_direct:                              
+	stp	x29, x30, [sp, #-32]!           
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8  
+	mov	x2, #8
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_8_byte_direct, .-test_8_byte_direct
+
+	.globl	test_16_byte_direct
+	.type	test_16_byte_direct,@function
+test_16_byte_direct:
+	stp	x29, x30, [sp, #-48]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #32
+	mov	x2, #16
+	bl	memcpy
+	ldp	x29, x30, [sp], #48
+	ret
+	.size	test_16_byte_direct, .-test_16_byte_direct
+
+	.globl	test_32_byte_direct
+	.type	test_32_byte_direct,@function
+test_32_byte_direct:
+	stp	x29, x30, [sp, #-80]!
+	mov	x29, sp
+	add	x1, sp, #16  
+	add	x0, sp, #48
+	mov	x2, #32
+	bl	memcpy
+	ldp	x29, x30, [sp], #80
+	ret
+	.size	test_32_byte_direct, .-test_32_byte_direct
+
+	.globl	test_37_byte_arbitrary
+	.type	test_37_byte_arbitrary,@function
+test_37_byte_arbitrary:
+	stp	x29, x30, [sp, #-96]!
+	mov	x29, sp
+	add	x1, sp, #16  
+	add	x0, sp, #56
+	mov	x2, #37
+	bl	memcpy
+	ldp	x29, x30, [sp], #96
+	ret
+	.size	test_37_byte_arbitrary, .-test_37_byte_arbitrary
+
+	.globl	test_128_byte_too_large
+	.type	test_128_byte_too_large,@function
+test_128_byte_too_large:
+	stp	x29, x30, [sp, #-288]!
+	mov	x29, sp
+	add	x1, sp, #16  
+	add	x0, sp, #152
+	mov	x2, #128
+	bl	memcpy
+	ldp	x29, x30, [sp], #288
+	ret
+	.size	test_128_byte_too_large, .-test_128_byte_too_large
+
+	.globl	main
+	.type	main,@function
+main:
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
+	
+	bl	test_1_byte_direct
+	bl	test_2_byte_direct
+	bl	test_4_byte_direct
+	bl	test_8_byte_direct
+	bl	test_16_byte_direct  
+	bl	test_32_byte_direct
+	bl	test_37_byte_arbitrary
+	bl	test_128_byte_too_large
+	
+	mov	w0, #0
+	ldp	x29, x30, [sp], #16
+	ret
+	.size	main, .-main
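
Each test function above follows the same shape: materialize a constant
into the size register with "mov x2, #N", then "bl memcpy". As a C-level
illustration (hypothetical source, not part of the patch, and assuming the
compiler keeps the libcall, e.g. under -fno-builtin-memcpy), code like the
following lowers to exactly that pattern on AArch64:

    // Hypothetical source for the call shape the test encodes; each
    // call becomes "mov x2, #N; bl memcpy" when kept as a libcall.
    #include <cstring>

    char Src[64], Dst[64];

    void copy_fixed_sizes() {
      std::memcpy(Dst, Src, 1);  // mov x2, #1;  bl memcpy
      std::memcpy(Dst, Src, 16); // mov x2, #16; bl memcpy
      std::memcpy(Dst, Src, 37); // mov x2, #37; bl memcpy
    }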

>From 1c27d8967a1938cea4e9bf3110362cb91d7b3bbb Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 21 Aug 2025 10:17:40 -0700
Subject: [PATCH 02/26] [BOLT] documentation

---
 bolt/docs/CommandLineArgumentReference.md | 2 +-
 bolt/lib/Rewrite/BinaryPassManager.cpp    | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md
index f3881c9a640a9..3fc0594514f6e 100644
--- a/bolt/docs/CommandLineArgumentReference.md
+++ b/bolt/docs/CommandLineArgumentReference.md
@@ -631,7 +631,7 @@
 
 - `--inline-memcpy`
 
-  Inline memcpy using 'rep movsb' instruction (X86-only)
+  Inline memcpy using optimized instruction sequences (X86: 'rep movsb', AArch64: width-optimized register operations)
 
 - `--inline-small-functions`
 
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index 996d2e972599d..6b554598cf1bc 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -247,7 +247,9 @@ static cl::opt<bool> Stoke("stoke", cl::desc("turn on the stoke analysis"),
 
 static cl::opt<bool> StringOps(
     "inline-memcpy",
-    cl::desc("inline memcpy using 'rep movsb' instruction (X86-only)"),
+    cl::desc(
+        "inline memcpy using size-specific optimized instructions "
+        "(X86: 'rep movsb', AArch64: width-optimized register operations)"),
     cl::cat(BoltOptCategory));
 
 static cl::opt<bool> StripRepRet(
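
For reference, the updated description matches how the pass is driven in
the tests from this series: the flag goes straight to the rewriter, e.g.
"llvm-bolt <input> --inline-memcpy -o <output>" (the same shape as the
RUN lines in the new test).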

>From db353b759b298aed2e0ebf86f99d6049a5a62e12 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 21 Aug 2025 11:25:05 -0700
Subject: [PATCH 03/26] [BOLT][AArch64] Implement safe size-aware memcpy
 inlining

---
 bolt/include/bolt/Core/MCPlusBuilder.h        |  16 ++
 bolt/lib/Passes/BinaryPasses.cpp              |  28 ++-
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 204 ++++++++++++++++++
 3 files changed, 246 insertions(+), 2 deletions(-)

diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index e773250ce8734..6cbf288f3b8f4 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -1895,6 +1895,22 @@ class MCPlusBuilder {
     return {};
   }
 
+  /// Creates a size-aware inline memcpy instruction sequence. If \p KnownSize
+  /// is provided, generates optimized code for that size. Falls back to the
+  /// regular createInlineMemcpy if the size is unknown or unused (e.g. X86).
+  virtual InstructionListType
+  createInlineMemcpy(bool ReturnEnd, std::optional<uint64_t> KnownSize) const {
+    return createInlineMemcpy(ReturnEnd);
+  }
+
+  /// Extract the immediate value from a move instruction that sets the given
+  /// register. Returns the immediate value if the instruction is a
+  /// move-immediate to \p TargetReg.
+  virtual std::optional<uint64_t>
+  extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const {
+    return std::nullopt;
+  }
+
   /// Create a target-specific relocation out of the \p Fixup.
   /// Note that not every fixup could be converted into a relocation.
   virtual std::optional<Relocation>
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index d7f02b9470030..0068c1ad0bf1c 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1843,7 +1843,7 @@ Error StripRepRet::runOnFunctions(BinaryContext &BC) {
 }
 
 Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
-  if (!BC.isX86())
+  if (!BC.isX86() && !BC.isAArch64())
     return Error::success();
 
   uint64_t NumInlined = 0;
@@ -1866,8 +1866,32 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
         const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
         const bool IsTailCall = BC.MIB->isTailCall(Inst);
 
+        // Extract the size of the copy from preceding instructions by looking
+        // for writes to the size register
+        std::optional<uint64_t> KnownSize = std::nullopt;
+        BitVector WrittenRegs(BC.MRI->getNumRegs());
+
+        // Get the size register (3rd arg register, index 2 for AArch64)
+        MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
+
+        // Look backwards through the basic block for size-setting instr
+        for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
+          MCInst &Inst = *InstIt;
+          WrittenRegs.reset(); // Clear and check what the instruction writes to
+          BC.MIB->getWrittenRegs(Inst, WrittenRegs);
+
+          // Check for writes to the size register
+          if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) {
+            if (std::optional<uint64_t> ExtractedSize =
+                    BC.MIB->extractMoveImmediate(Inst, SizeReg)) {
+              KnownSize = *ExtractedSize;
+              break;
+            }
+          }
+        }
+
         const InstructionListType NewCode =
-            BC.MIB->createInlineMemcpy(IsMemcpy8);
+            BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);
         II = BB.replaceInstruction(II, NewCode);
         std::advance(II, NewCode.size() - 1);
         if (IsTailCall) {
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 973261765f951..03f62117ea096 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2597,6 +2597,210 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
   getInstructionSize(const MCInst &Inst) const override {
     return 4;
   }
+
+  InstructionListType createInlineMemcpy(bool ReturnEnd) const override {
+    // Fallback
+    return createInlineMemcpy(ReturnEnd, std::nullopt);
+  }
+
+  std::optional<uint64_t>
+  extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
+    if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3) {
+      if (Inst.getOperand(0).isReg() &&
+          Inst.getOperand(0).getReg() == TargetReg &&
+          Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() &&
+          Inst.getOperand(2).getImm() == 0) {
+        return Inst.getOperand(1).getImm();
+      }
+    }
+    return std::nullopt;
+  }
+
+  InstructionListType
+  createInlineMemcpy(bool ReturnEnd,
+                     std::optional<uint64_t> KnownSize) const override {
+    InstructionListType Code;
+    if (ReturnEnd) {
+      if (KnownSize.has_value() && (*KnownSize >> 12) == 0) {
+        // Use immediate if size is known and fits in 12-bit immediate (0-4095)
+        Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
+                              .addReg(AArch64::X0)
+                              .addReg(AArch64::X0)
+                              .addImm(*KnownSize)
+                              .addImm(0));
+      } else {
+        // Fall back to register add for unknown or large sizes
+        Code.emplace_back(MCInstBuilder(AArch64::ADDXrr)
+                              .addReg(AArch64::X0)
+                              .addReg(AArch64::X0)
+                              .addReg(AArch64::X2));
+      }
+    }
+
+    if (!KnownSize.has_value()) {
+      return Code;
+    }
+
+    uint64_t Size = *KnownSize;
+    return generateSizeSpecificMemcpy(Code, Size);
+  }
+
+  InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
+                                                 uint64_t Size) const {
+    // Generate optimal instruction sequences based on exact size
+    switch (Size) {
+    case 1:
+      // Single byte copy
+      Code.emplace_back(MCInstBuilder(AArch64::LDRBBui)
+                            .addReg(AArch64::W3)
+                            .addReg(AArch64::X1)
+                            .addImm(0));
+      Code.emplace_back(MCInstBuilder(AArch64::STRBBui)
+                            .addReg(AArch64::W3)
+                            .addReg(AArch64::X0)
+                            .addImm(0));
+      break;
+
+    case 2:
+      // 2-byte copy using 16-bit load/store
+      Code.emplace_back(MCInstBuilder(AArch64::LDRHHui)
+                            .addReg(AArch64::W3)
+                            .addReg(AArch64::X1)
+                            .addImm(0));
+      Code.emplace_back(MCInstBuilder(AArch64::STRHHui)
+                            .addReg(AArch64::W3)
+                            .addReg(AArch64::X0)
+                            .addImm(0));
+      break;
+
+    case 4:
+      // 4-byte copy using 32-bit load/store
+      Code.emplace_back(MCInstBuilder(AArch64::LDRWui)
+                            .addReg(AArch64::W3)
+                            .addReg(AArch64::X1)
+                            .addImm(0));
+      Code.emplace_back(MCInstBuilder(AArch64::STRWui)
+                            .addReg(AArch64::W3)
+                            .addReg(AArch64::X0)
+                            .addImm(0));
+      break;
+
+    case 8:
+      // 8-byte copy using 64-bit load/store
+      Code.emplace_back(MCInstBuilder(AArch64::LDRXui)
+                            .addReg(AArch64::X3)
+                            .addReg(AArch64::X1)
+                            .addImm(0));
+      Code.emplace_back(MCInstBuilder(AArch64::STRXui)
+                            .addReg(AArch64::X3)
+                            .addReg(AArch64::X0)
+                            .addImm(0));
+      break;
+
+    case 16:
+      // 16-byte copy using 128-bit SIMD
+      Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
+                            .addReg(AArch64::Q0)
+                            .addReg(AArch64::X1)
+                            .addImm(0));
+      Code.emplace_back(MCInstBuilder(AArch64::STRQui)
+                            .addReg(AArch64::Q0)
+                            .addReg(AArch64::X0)
+                            .addImm(0));
+      break;
+
+    case 32:
+      // 32-byte copy using two 128-bit SIMD operations
+      Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
+                            .addReg(AArch64::Q0)
+                            .addReg(AArch64::X1)
+                            .addImm(0));
+      Code.emplace_back(MCInstBuilder(AArch64::STRQui)
+                            .addReg(AArch64::Q0)
+                            .addReg(AArch64::X0)
+                            .addImm(0));
+      Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
+                            .addReg(AArch64::Q1)
+                            .addReg(AArch64::X1)
+                            .addImm(1));
+      Code.emplace_back(MCInstBuilder(AArch64::STRQui)
+                            .addReg(AArch64::Q1)
+                            .addReg(AArch64::X0)
+                            .addImm(1));
+      break;
+
+    default:
+      if (Size <= 64) {
+        // For sizes up to 64 bytes, greedily use the largest possible loads in
+        // descending order
+        uint64_t Remaining = Size;
+        uint64_t Offset = 0;
+
+        while (Remaining >= 16) {
+          Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
+                                .addReg(AArch64::Q0)
+                                .addReg(AArch64::X1)
+                                .addImm(Offset / 16));
+          Code.emplace_back(MCInstBuilder(AArch64::STRQui)
+                                .addReg(AArch64::Q0)
+                                .addReg(AArch64::X0)
+                                .addImm(Offset / 16));
+          Remaining -= 16;
+          Offset += 16;
+        }
+        if (Remaining >= 8) {
+          Code.emplace_back(MCInstBuilder(AArch64::LDRXui)
+                                .addReg(AArch64::X3)
+                                .addReg(AArch64::X1)
+                                .addImm(Offset / 8));
+          Code.emplace_back(MCInstBuilder(AArch64::STRXui)
+                                .addReg(AArch64::X3)
+                                .addReg(AArch64::X0)
+                                .addImm(Offset / 8));
+          Remaining -= 8;
+          Offset += 8;
+        }
+        if (Remaining >= 4) {
+          Code.emplace_back(MCInstBuilder(AArch64::LDRWui)
+                                .addReg(AArch64::W3)
+                                .addReg(AArch64::X1)
+                                .addImm(Offset / 4));
+          Code.emplace_back(MCInstBuilder(AArch64::STRWui)
+                                .addReg(AArch64::W3)
+                                .addReg(AArch64::X0)
+                                .addImm(Offset / 4));
+          Remaining -= 4;
+          Offset += 4;
+        }
+        if (Remaining >= 2) {
+          Code.emplace_back(MCInstBuilder(AArch64::LDRHHui)
+                                .addReg(AArch64::W3)
+                                .addReg(AArch64::X1)
+                                .addImm(Offset / 2));
+          Code.emplace_back(MCInstBuilder(AArch64::STRHHui)
+                                .addReg(AArch64::W3)
+                                .addReg(AArch64::X0)
+                                .addImm(Offset / 2));
+          Remaining -= 2;
+          Offset += 2;
+        }
+        if (Remaining == 1) {
+          Code.emplace_back(MCInstBuilder(AArch64::LDRBBui)
+                                .addReg(AArch64::W3)
+                                .addReg(AArch64::X1)
+                                .addImm(Offset));
+          Code.emplace_back(MCInstBuilder(AArch64::STRBBui)
+                                .addReg(AArch64::W3)
+                                .addReg(AArch64::X0)
+                                .addImm(Offset));
+        }
+      } else {
+        Code.clear();
+      }
+      break;
+    }
+    return Code;
+  }
 };
 
 } // end anonymous namespace
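
To make the default branch concrete: for the 37-byte case the test checks,
the greedy walk emits chunks of 16, 16, 4 and 1 bytes at offsets 0, 16, 32
and 36, i.e. the (2*16) + (1*4) + (1*1) decomposition. A standalone sketch
(plain C++, not BOLT code) that mirrors the loop:

    // Mirrors the greedy decomposition in generateSizeSpecificMemcpy.
    // For Size = 37 it prints chunks of 16, 16, 4, 1 at offsets
    // 0, 16, 32, 36, matching the widths the test expects.
    #include <cstdint>
    #include <cstdio>
    #include <initializer_list>

    int main() {
      std::uint64_t Remaining = 37, Offset = 0;
      for (std::uint64_t Width : {16, 8, 4, 2, 1}) {
        while (Remaining >= Width) {
          std::printf("copy %llu bytes at offset %llu\n",
                      (unsigned long long)Width, (unsigned long long)Offset);
          Remaining -= Width;
          Offset += Width;
        }
      }
      return 0;
    }

(The pass uses a single "if" for the widths below 16, which is equivalent
here: fewer than 16 bytes remain after the q-register loop, so each
smaller width can fire at most once.)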

>From 2e5b22b501a83796ff10ae30520e07cb44b21332 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 22 Aug 2025 05:14:11 -0700
Subject: [PATCH 04/26] test target fix for CI cross-compilation issue

---
 bolt/test/AArch64/inline-memcpy.s | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bolt/test/AArch64/inline-memcpy.s b/bolt/test/AArch64/inline-memcpy.s
index 3bb498e600fb6..e46308286e07b 100644
--- a/bolt/test/AArch64/inline-memcpy.s
+++ b/bolt/test/AArch64/inline-memcpy.s
@@ -1,6 +1,6 @@
 ## This test checks that BOLT correctly inlines memcpy calls on AArch64.
 
-# REQUIRES: system-linux
+# REQUIRES: system-linux, aarch64-registered-target
 
 # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
 # RUN: %clang --target=aarch64-unknown-linux-gnu %t.o -o %t.exe -Wl,-q  

>From 385fa23691e05fbdb6ffb24cc6a9526ff8d08020 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 22 Aug 2025 05:49:37 -0700
Subject: [PATCH 05/26] moved inline-memcpy to avoid CI cross-compilation PIE
 conflicts

---
 bolt/test/runtime/AArch64/inline-memcpy.s | 193 ++++++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100644 bolt/test/runtime/AArch64/inline-memcpy.s

diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
new file mode 100644
index 0000000000000..0e16b6a7e963f
--- /dev/null
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -0,0 +1,193 @@
+## This test checks that BOLT correctly inlines memcpy calls on AArch64.
+
+# REQUIRES: system-linux, aarch64-registered-target
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q 
+# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
+# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
+
+# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed)
+# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
+
+# Each function should use optimal size-specific instructions and NO memcpy calls
+
+# 1-byte copy should use single byte load/store (ldrb/strb)
+# CHECK-ASM-LABEL: <test_1_byte_direct>:
+# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 2-byte copy should use single 16-bit load/store (ldrh/strh)
+# CHECK-ASM-LABEL: <test_2_byte_direct>:
+# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 4-byte copy should use single 32-bit load/store (w register)
+# CHECK-ASM-LABEL: <test_4_byte_direct>:
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 8-byte copy should use single 64-bit load/store (x register)
+# CHECK-ASM-LABEL: <test_8_byte_direct>:
+# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 16-byte copy should use single 128-bit SIMD load/store (q register)
+# CHECK-ASM-LABEL: <test_16_byte_direct>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 32-byte copy should use two 128-bit SIMD operations
+# CHECK-ASM-LABEL: <test_32_byte_direct>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
+# CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20]
+# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24]
+# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
+# CHECK-ASM-LABEL: <test_128_byte_too_large>:
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+# CHECK-ASM-NOT: ldr{{.*}}q{{[0-9]+}}
+
+	.text
+	.globl	test_1_byte_direct                
+	.type	test_1_byte_direct,@function
+test_1_byte_direct:                              
+	stp	x29, x30, [sp, #-32]!           
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8  
+	mov	x2, #1
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_1_byte_direct, .-test_1_byte_direct
+
+	.globl	test_2_byte_direct                
+	.type	test_2_byte_direct,@function
+test_2_byte_direct:                              
+	stp	x29, x30, [sp, #-32]!           
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8  
+	mov	x2, #2
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_2_byte_direct, .-test_2_byte_direct
+
+	.globl	test_4_byte_direct                
+	.type	test_4_byte_direct,@function
+test_4_byte_direct:                              
+	stp	x29, x30, [sp, #-32]!           
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8  
+	mov	x2, #4
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_4_byte_direct, .-test_4_byte_direct
+
+	.globl	test_8_byte_direct                
+	.type	test_8_byte_direct,@function
+test_8_byte_direct:                              
+	stp	x29, x30, [sp, #-32]!           
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8  
+	mov	x2, #8
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_8_byte_direct, .-test_8_byte_direct
+
+	.globl	test_16_byte_direct
+	.type	test_16_byte_direct,@function
+test_16_byte_direct:
+	stp	x29, x30, [sp, #-48]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #32
+	mov	x2, #16
+	bl	memcpy
+	ldp	x29, x30, [sp], #48
+	ret
+	.size	test_16_byte_direct, .-test_16_byte_direct
+
+	.globl	test_32_byte_direct
+	.type	test_32_byte_direct,@function
+test_32_byte_direct:
+	stp	x29, x30, [sp, #-80]!
+	mov	x29, sp
+	add	x1, sp, #16  
+	add	x0, sp, #48
+	mov	x2, #32
+	bl	memcpy
+	ldp	x29, x30, [sp], #80
+	ret
+	.size	test_32_byte_direct, .-test_32_byte_direct
+
+	.globl	test_37_byte_arbitrary
+	.type	test_37_byte_arbitrary,@function
+test_37_byte_arbitrary:
+	stp	x29, x30, [sp, #-96]!
+	mov	x29, sp
+	add	x1, sp, #16  
+	add	x0, sp, #56
+	mov	x2, #37
+	bl	memcpy
+	ldp	x29, x30, [sp], #96
+	ret
+	.size	test_37_byte_arbitrary, .-test_37_byte_arbitrary
+
+	.globl	test_128_byte_too_large
+	.type	test_128_byte_too_large,@function
+test_128_byte_too_large:
+	stp	x29, x30, [sp, #-288]!
+	mov	x29, sp
+	add	x1, sp, #16  
+	add	x0, sp, #152
+	mov	x2, #128
+	bl	memcpy
+	ldp	x29, x30, [sp], #288
+	ret
+	.size	test_128_byte_too_large, .-test_128_byte_too_large
+
+	.globl	main
+	.type	main,@function
+main:
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
+	
+	bl	test_1_byte_direct
+	bl	test_2_byte_direct
+	bl	test_4_byte_direct
+	bl	test_8_byte_direct
+	bl	test_16_byte_direct  
+	bl	test_32_byte_direct
+	bl	test_37_byte_arbitrary
+	bl	test_128_byte_too_large
+	
+	mov	w0, #0
+	ldp	x29, x30, [sp], #16
+	ret
+	.size	main, .-main

>From 4f9ef678f0d07e23a362cf28805749d53bc8b0b5 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 22 Aug 2025 05:56:47 -0700
Subject: [PATCH 06/26] removed old test

---
 bolt/test/AArch64/inline-memcpy.s | 193 ------------------------------
 1 file changed, 193 deletions(-)
 delete mode 100644 bolt/test/AArch64/inline-memcpy.s

diff --git a/bolt/test/AArch64/inline-memcpy.s b/bolt/test/AArch64/inline-memcpy.s
deleted file mode 100644
index e46308286e07b..0000000000000
--- a/bolt/test/AArch64/inline-memcpy.s
+++ /dev/null
@@ -1,193 +0,0 @@
-## This test checks that BOLT correctly inlines memcpy calls on AArch64.
-
-# REQUIRES: system-linux, aarch64-registered-target
-
-# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
-# RUN: %clang --target=aarch64-unknown-linux-gnu %t.o -o %t.exe -Wl,-q  
-# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
-# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
-
-# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed)
-# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
-
-# Each function should use optimal size-specific instructions and NO memcpy calls
-
-# 1-byte copy should use single byte load/store (ldrb/strb)
-# CHECK-ASM-LABEL: <test_1_byte_direct>:
-# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 2-byte copy should use single 16-bit load/store (ldrh/strh)
-# CHECK-ASM-LABEL: <test_2_byte_direct>:
-# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 4-byte copy should use single 32-bit load/store (w register)
-# CHECK-ASM-LABEL: <test_4_byte_direct>:
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 8-byte copy should use single 64-bit load/store (x register)
-# CHECK-ASM-LABEL: <test_8_byte_direct>:
-# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 16-byte copy should use single 128-bit SIMD load/store (q register)
-# CHECK-ASM-LABEL: <test_16_byte_direct>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 32-byte copy should use two 128-bit SIMD operations
-# CHECK-ASM-LABEL: <test_32_byte_direct>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
-# CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20]
-# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24]
-# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-
-# 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
-# CHECK-ASM-LABEL: <test_128_byte_too_large>:
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-# CHECK-ASM-NOT: ldr{{.*}}q{{[0-9]+}}
-
-	.text
-	.globl	test_1_byte_direct                
-	.type	test_1_byte_direct,@function
-test_1_byte_direct:                              
-	stp	x29, x30, [sp, #-32]!           
-	mov	x29, sp
-	add	x1, sp, #16
-	add	x0, sp, #8  
-	mov	x2, #1
-	bl	memcpy
-	ldp	x29, x30, [sp], #32
-	ret
-	.size	test_1_byte_direct, .-test_1_byte_direct
-
-	.globl	test_2_byte_direct                
-	.type	test_2_byte_direct,@function
-test_2_byte_direct:                              
-	stp	x29, x30, [sp, #-32]!           
-	mov	x29, sp
-	add	x1, sp, #16
-	add	x0, sp, #8  
-	mov	x2, #2
-	bl	memcpy
-	ldp	x29, x30, [sp], #32
-	ret
-	.size	test_2_byte_direct, .-test_2_byte_direct
-
-	.globl	test_4_byte_direct                
-	.type	test_4_byte_direct,@function
-test_4_byte_direct:                              
-	stp	x29, x30, [sp, #-32]!           
-	mov	x29, sp
-	add	x1, sp, #16
-	add	x0, sp, #8  
-	mov	x2, #4
-	bl	memcpy
-	ldp	x29, x30, [sp], #32
-	ret
-	.size	test_4_byte_direct, .-test_4_byte_direct
-
-	.globl	test_8_byte_direct                
-	.type	test_8_byte_direct,@function
-test_8_byte_direct:                              
-	stp	x29, x30, [sp, #-32]!           
-	mov	x29, sp
-	add	x1, sp, #16
-	add	x0, sp, #8  
-	mov	x2, #8
-	bl	memcpy
-	ldp	x29, x30, [sp], #32
-	ret
-	.size	test_8_byte_direct, .-test_8_byte_direct
-
-	.globl	test_16_byte_direct
-	.type	test_16_byte_direct,@function
-test_16_byte_direct:
-	stp	x29, x30, [sp, #-48]!
-	mov	x29, sp
-	add	x1, sp, #16
-	add	x0, sp, #32
-	mov	x2, #16
-	bl	memcpy
-	ldp	x29, x30, [sp], #48
-	ret
-	.size	test_16_byte_direct, .-test_16_byte_direct
-
-	.globl	test_32_byte_direct
-	.type	test_32_byte_direct,@function
-test_32_byte_direct:
-	stp	x29, x30, [sp, #-80]!
-	mov	x29, sp
-	add	x1, sp, #16  
-	add	x0, sp, #48
-	mov	x2, #32
-	bl	memcpy
-	ldp	x29, x30, [sp], #80
-	ret
-	.size	test_32_byte_direct, .-test_32_byte_direct
-
-	.globl	test_37_byte_arbitrary
-	.type	test_37_byte_arbitrary,@function
-test_37_byte_arbitrary:
-	stp	x29, x30, [sp, #-96]!
-	mov	x29, sp
-	add	x1, sp, #16  
-	add	x0, sp, #56
-	mov	x2, #37
-	bl	memcpy
-	ldp	x29, x30, [sp], #96
-	ret
-	.size	test_37_byte_arbitrary, .-test_37_byte_arbitrary
-
-	.globl	test_128_byte_too_large
-	.type	test_128_byte_too_large,@function
-test_128_byte_too_large:
-	stp	x29, x30, [sp, #-288]!
-	mov	x29, sp
-	add	x1, sp, #16  
-	add	x0, sp, #152
-	mov	x2, #128
-	bl	memcpy
-	ldp	x29, x30, [sp], #288
-	ret
-	.size	test_128_byte_too_large, .-test_128_byte_too_large
-
-	.globl	main
-	.type	main,@function
-main:
-	stp	x29, x30, [sp, #-16]!
-	mov	x29, sp
-	
-	bl	test_1_byte_direct
-	bl	test_2_byte_direct
-	bl	test_4_byte_direct
-	bl	test_8_byte_direct
-	bl	test_16_byte_direct  
-	bl	test_32_byte_direct
-	bl	test_37_byte_arbitrary
-	bl	test_128_byte_too_large
-	
-	mov	w0, #0
-	ldp	x29, x30, [sp], #16
-	ret
-	.size	main, .-main

>From e83126edd3dd418086f8341a92609210ba7cb874 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 22 Aug 2025 08:51:18 -0700
Subject: [PATCH 07/26] response to review

---
 bolt/lib/Passes/BinaryPasses.cpp              |  37 +++--
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 140 ++++--------------
 2 files changed, 49 insertions(+), 128 deletions(-)

diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 0068c1ad0bf1c..e532c2aa0422d 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1866,26 +1866,25 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
         const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
         const bool IsTailCall = BC.MIB->isTailCall(Inst);
 
-        // Extract the size of the copy from preceding instructions by looking
-        // for writes to the size register
+        // Extract size from preceding instructions (AArch64 only)
+        // Pattern: MOV X2, #num-bytes; BL memcpy(dest, src, X2)
         std::optional<uint64_t> KnownSize = std::nullopt;
-        BitVector WrittenRegs(BC.MRI->getNumRegs());
-
-        // Get the size register (3rd arg register, index 2 for AArch64)
-        MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
-
-        // Look backwards through the basic block for size-setting instr
-        for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
-          MCInst &Inst = *InstIt;
-          WrittenRegs.reset(); // Clear and check what the instruction writes to
-          BC.MIB->getWrittenRegs(Inst, WrittenRegs);
-
-          // Check for writes to the size register
-          if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) {
-            if (std::optional<uint64_t> ExtractedSize =
-                    BC.MIB->extractMoveImmediate(Inst, SizeReg)) {
-              KnownSize = *ExtractedSize;
-              break;
+        if (BC.isAArch64()) {
+          BitVector WrittenRegs(BC.MRI->getNumRegs());
+          MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
+
+          // Look backwards for size-setting instruction
+          for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
+            MCInst &Inst = *InstIt;
+            WrittenRegs.reset();
+            BC.MIB->getWrittenRegs(Inst, WrittenRegs);
+
+            if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) {
+              if (std::optional<uint64_t> ExtractedSize =
+                      BC.MIB->extractMoveImmediate(Inst, SizeReg)) {
+                KnownSize = *ExtractedSize;
+                break;
+              }
             }
           }
         }
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 03f62117ea096..e640044ec762d 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2647,152 +2647,74 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
 
   InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
                                                  uint64_t Size) const {
+    // Helper to add load/store pair
+    auto addLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
+                                unsigned Reg, unsigned Offset = 0) {
+      Code.emplace_back(MCInstBuilder(LoadOpc)
+                            .addReg(Reg)
+                            .addReg(AArch64::X1)
+                            .addImm(Offset));
+      Code.emplace_back(MCInstBuilder(StoreOpc)
+                            .addReg(Reg)
+                            .addReg(AArch64::X0)
+                            .addImm(Offset));
+    };
+
     // Generate optimal instruction sequences based on exact size
     switch (Size) {
     case 1:
-      // Single byte copy
-      Code.emplace_back(MCInstBuilder(AArch64::LDRBBui)
-                            .addReg(AArch64::W3)
-                            .addReg(AArch64::X1)
-                            .addImm(0));
-      Code.emplace_back(MCInstBuilder(AArch64::STRBBui)
-                            .addReg(AArch64::W3)
-                            .addReg(AArch64::X0)
-                            .addImm(0));
+      addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
       break;
-
     case 2:
-      // 2-byte copy using 16-bit load/store
-      Code.emplace_back(MCInstBuilder(AArch64::LDRHHui)
-                            .addReg(AArch64::W3)
-                            .addReg(AArch64::X1)
-                            .addImm(0));
-      Code.emplace_back(MCInstBuilder(AArch64::STRHHui)
-                            .addReg(AArch64::W3)
-                            .addReg(AArch64::X0)
-                            .addImm(0));
+      addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3);
       break;
-
     case 4:
-      // 4-byte copy using 32-bit load/store
-      Code.emplace_back(MCInstBuilder(AArch64::LDRWui)
-                            .addReg(AArch64::W3)
-                            .addReg(AArch64::X1)
-                            .addImm(0));
-      Code.emplace_back(MCInstBuilder(AArch64::STRWui)
-                            .addReg(AArch64::W3)
-                            .addReg(AArch64::X0)
-                            .addImm(0));
+      addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3);
       break;
-
     case 8:
-      // 8-byte copy using 64-bit load/store
-      Code.emplace_back(MCInstBuilder(AArch64::LDRXui)
-                            .addReg(AArch64::X3)
-                            .addReg(AArch64::X1)
-                            .addImm(0));
-      Code.emplace_back(MCInstBuilder(AArch64::STRXui)
-                            .addReg(AArch64::X3)
-                            .addReg(AArch64::X0)
-                            .addImm(0));
+      addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3);
       break;
-
     case 16:
-      // 16-byte copy using 128-bit SIMD
-      Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
-                            .addReg(AArch64::Q0)
-                            .addReg(AArch64::X1)
-                            .addImm(0));
-      Code.emplace_back(MCInstBuilder(AArch64::STRQui)
-                            .addReg(AArch64::Q0)
-                            .addReg(AArch64::X0)
-                            .addImm(0));
+      addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0);
       break;
-
     case 32:
-      // 32-byte copy using two 128-bit SIMD operations
-      Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
-                            .addReg(AArch64::Q0)
-                            .addReg(AArch64::X1)
-                            .addImm(0));
-      Code.emplace_back(MCInstBuilder(AArch64::STRQui)
-                            .addReg(AArch64::Q0)
-                            .addReg(AArch64::X0)
-                            .addImm(0));
-      Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
-                            .addReg(AArch64::Q1)
-                            .addReg(AArch64::X1)
-                            .addImm(1));
-      Code.emplace_back(MCInstBuilder(AArch64::STRQui)
-                            .addReg(AArch64::Q1)
-                            .addReg(AArch64::X0)
-                            .addImm(1));
+      addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0);
+      addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1);
       break;
 
     default:
       if (Size <= 64) {
-        // For sizes up to 64 bytes, greedily use the largest possible loads in
-        // descending order
+        // For sizes up to 64 bytes, greedily use the largest possible loads
         uint64_t Remaining = Size;
         uint64_t Offset = 0;
 
         while (Remaining >= 16) {
-          Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
-                                .addReg(AArch64::Q0)
-                                .addReg(AArch64::X1)
-                                .addImm(Offset / 16));
-          Code.emplace_back(MCInstBuilder(AArch64::STRQui)
-                                .addReg(AArch64::Q0)
-                                .addReg(AArch64::X0)
-                                .addImm(Offset / 16));
+          addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0,
+                           Offset / 16);
           Remaining -= 16;
           Offset += 16;
         }
         if (Remaining >= 8) {
-          Code.emplace_back(MCInstBuilder(AArch64::LDRXui)
-                                .addReg(AArch64::X3)
-                                .addReg(AArch64::X1)
-                                .addImm(Offset / 8));
-          Code.emplace_back(MCInstBuilder(AArch64::STRXui)
-                                .addReg(AArch64::X3)
-                                .addReg(AArch64::X0)
-                                .addImm(Offset / 8));
+          addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3,
+                           Offset / 8);
           Remaining -= 8;
           Offset += 8;
         }
         if (Remaining >= 4) {
-          Code.emplace_back(MCInstBuilder(AArch64::LDRWui)
-                                .addReg(AArch64::W3)
-                                .addReg(AArch64::X1)
-                                .addImm(Offset / 4));
-          Code.emplace_back(MCInstBuilder(AArch64::STRWui)
-                                .addReg(AArch64::W3)
-                                .addReg(AArch64::X0)
-                                .addImm(Offset / 4));
+          addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3,
+                           Offset / 4);
           Remaining -= 4;
           Offset += 4;
         }
         if (Remaining >= 2) {
-          Code.emplace_back(MCInstBuilder(AArch64::LDRHHui)
-                                .addReg(AArch64::W3)
-                                .addReg(AArch64::X1)
-                                .addImm(Offset / 2));
-          Code.emplace_back(MCInstBuilder(AArch64::STRHHui)
-                                .addReg(AArch64::W3)
-                                .addReg(AArch64::X0)
-                                .addImm(Offset / 2));
+          addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3,
+                           Offset / 2);
           Remaining -= 2;
           Offset += 2;
         }
         if (Remaining == 1) {
-          Code.emplace_back(MCInstBuilder(AArch64::LDRBBui)
-                                .addReg(AArch64::W3)
-                                .addReg(AArch64::X1)
-                                .addImm(Offset));
-          Code.emplace_back(MCInstBuilder(AArch64::STRBBui)
-                                .addReg(AArch64::W3)
-                                .addReg(AArch64::X0)
-                                .addImm(Offset));
+          addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3,
+                           Offset);
         }
       } else {
         Code.clear();
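
One subtlety the new addLoadStorePair helper relies on: its Offset
parameter is the scaled immediate of the unsigned-offset ("ui") load/store
encodings, not a byte offset, which is why the call sites pass Offset/16,
Offset/8, and so on. A small standalone sketch of the convention (an
illustration, not BOLT code):

    // The "ui" forms encode byte_offset / access_width, so a scaled
    // immediate of 1 on LDRQui/STRQui disassembles as, e.g., [x1, #0x10].
    #include <cassert>
    #include <cstdint>

    std::uint64_t scaledImm(std::uint64_t ByteOffset,
                            std::uint64_t AccessWidth) {
      assert(ByteOffset % AccessWidth == 0 && "ui forms need aligned offsets");
      return ByteOffset / AccessWidth;
    }

    int main() {
      // Second q-register pair of a 32-byte copy: byte offset 16 -> imm 1.
      return scaledImm(16, 16) == 1 ? 0 : 1;
    }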

>From cf8279a8b5081eec657a1f835c54470653186787 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Wed, 27 Aug 2025 03:57:43 -0700
Subject: [PATCH 08/26] Update conditional formatting and move the size check
 into BinaryPasses

---
 bolt/lib/Passes/BinaryPasses.cpp                 |  5 +++++
 bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 13 ++++---------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index e532c2aa0422d..1aade44286052 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1889,6 +1889,11 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
           }
         }
 
+        if (BC.isAArch64() && !KnownSize.has_value()) {
+          ++II;
+          continue;
+        }
+
         const InstructionListType NewCode =
             BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);
         II = BB.replaceInstruction(II, NewCode);
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index e640044ec762d..9d30fdface0c5 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2621,24 +2621,19 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
                      std::optional<uint64_t> KnownSize) const override {
     InstructionListType Code;
     if (ReturnEnd) {
-      if (KnownSize.has_value() && (*KnownSize >> 12) == 0) {
-        // Use immediate if size is known and fits in 12-bit immediate (0-4095)
+      // Use immediate if size fits in 12-bit immediate (0-4095)
+      // Otherwise, fall back to register add for large sizes
+      if ((*KnownSize >> 12) == 0)
         Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
                               .addReg(AArch64::X0)
                               .addReg(AArch64::X0)
                               .addImm(*KnownSize)
                               .addImm(0));
-      } else {
-        // Fall back to register add for unknown or large sizes
+      else
         Code.emplace_back(MCInstBuilder(AArch64::ADDXrr)
                               .addReg(AArch64::X0)
                               .addReg(AArch64::X0)
                               .addReg(AArch64::X2));
-      }
-    }
-
-    if (!KnownSize.has_value()) {
-      return Code;
     }
 
     uint64_t Size = *KnownSize;
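
Note that the unconditional *KnownSize dereference is safe after this
change: the new early-continue in BinaryPasses guarantees the AArch64 path
only reaches the builder with a known size. The "(*KnownSize >> 12) == 0"
test corresponds to the AArch64 ADD (immediate) encoding, whose immediate
field is 12 bits (0-4095). A quick standalone sketch of the range check:

    // AArch64 ADD (immediate) holds a 12-bit unsigned immediate, so
    // sizes 0-4095 can be added to X0 directly; larger sizes need the
    // register form (ADDXrr with X2).
    #include <cstdint>

    bool fitsInAdd12(std::uint64_t Size) { return (Size >> 12) == 0; }

    int main() { return (fitsInAdd12(4095) && !fitsInAdd12(4096)) ? 0 : 1; }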

>From c317eb0cbd62ac6f164cf44b75d40e082167ce3d Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Wed, 27 Aug 2025 04:55:17 -0700
Subject: [PATCH 09/26] Negative Tests (live-in, register move, non-mov
 instruction)

---
 bolt/test/runtime/AArch64/inline-memcpy.s | 61 ++++++++++++++++++++++-
 1 file changed, 60 insertions(+), 1 deletion(-)

diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 0e16b6a7e963f..417b444f6a4bb 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,7 +7,7 @@
 # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
 # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
 
-# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed)
+# Verify BOLT reports that it inlined memcpy calls (8 successful inlines out of 11 total calls)
 # CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
 
 # Each function should use optimal size-specific instructions and NO memcpy calls
@@ -67,6 +67,18 @@
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 # CHECK-ASM-NOT: ldr{{.*}}q{{[0-9]+}}
 
+# ADD immediate through another register should NOT be inlined (mov+add chains are not tracked)
+# CHECK-ASM-LABEL: <test_4_byte_add_immediate>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
+# Register move should NOT be inlined (size unknown at compile time)
+# CHECK-ASM-LABEL: <test_register_move_negative>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
+# Live-in parameter should NOT be inlined (size unknown at compile time)
+# CHECK-ASM-LABEL: <test_live_in_negative>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
 	.text
 	.globl	test_1_byte_direct                
 	.type	test_1_byte_direct,@function
@@ -172,6 +184,50 @@ test_128_byte_too_large:
 	ret
 	.size	test_128_byte_too_large, .-test_128_byte_too_large
 
+	.globl	test_4_byte_add_immediate
+	.type	test_4_byte_add_immediate,@function
+test_4_byte_add_immediate:
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8
+	mov	x3, #0
+	add	x2, x3, #4
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_4_byte_add_immediate, .-test_4_byte_add_immediate
+
+	.globl	test_register_move_negative
+	.type	test_register_move_negative,@function
+test_register_move_negative:
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8
+	mov	x6, #4
+	mov	x2, x6
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_register_move_negative, .-test_register_move_negative
+
+	.globl	test_live_in_negative
+	.type	test_live_in_negative,@function
+test_live_in_negative:
+	# x2 comes in as parameter, no instruction sets it (should NOT inline)
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8
+	# x2 is live-in, no size-setting instruction
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_live_in_negative, .-test_live_in_negative
+
+
+
 	.globl	main
 	.type	main,@function
 main:
@@ -186,6 +242,9 @@ main:
 	bl	test_32_byte_direct
 	bl	test_37_byte_arbitrary
 	bl	test_128_byte_too_large
+	bl	test_4_byte_add_immediate
+	bl	test_register_move_negative
+	bl	test_live_in_negative
 	
 	mov	w0, #0
 	ldp	x29, x30, [sp], #16
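
All three negative tests defeat the one pattern extractMoveImmediate
accepts: a MOVZ with shift 0 (the literal "mov x2, #N") that writes the
size register. A simplified mirror of that predicate, using stand-in types
rather than MCInst (sketch only):

    #include <cstdint>
    #include <optional>

    enum class Opc { MOVZXi, ORRXrs, ADDXri };
    struct Inst { Opc Opcode; int DstReg; std::uint64_t Imm; int Shift; };

    // Only "mov xN, #imm" (MOVZ, shift 0) writing the size register
    // yields a known size; add-immediate chains, register moves (ORR
    // aliases), and live-in registers produce no match.
    std::optional<std::uint64_t> extractMoveImm(const Inst &I, int SizeReg) {
      if (I.Opcode == Opc::MOVZXi && I.DstReg == SizeReg && I.Shift == 0)
        return I.Imm;
      return std::nullopt;
    }

    int main() {
      Inst Mov{Opc::MOVZXi, /*DstReg=*/2, /*Imm=*/37, /*Shift=*/0};
      Inst Add{Opc::ADDXri, 2, 4, 0};
      return (extractMoveImm(Mov, 2) == 37u && !extractMoveImm(Add, 2)) ? 0 : 1;
    }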

>From df97d61befcc9ceaf3d82648a1b68b88cc3e0451 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Wed, 27 Aug 2025 06:51:08 -0700
Subject: [PATCH 10/26] Remove redundant _memcpy8 handling

---
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 29 ++++++---------
 bolt/test/runtime/AArch64/inline-memcpy.s     | 36 ++++++++++++++++++-
 2 files changed, 46 insertions(+), 19 deletions(-)

diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 9d30fdface0c5..366d4183bca51 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2620,24 +2620,19 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
   createInlineMemcpy(bool ReturnEnd,
                      std::optional<uint64_t> KnownSize) const override {
     InstructionListType Code;
-    if (ReturnEnd) {
-      // Use immediate if size fits in 12-bit immediate (0-4095)
-      // Otherwise, fall back to register add for large sizes
-      if ((*KnownSize >> 12) == 0)
-        Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
-                              .addReg(AArch64::X0)
-                              .addReg(AArch64::X0)
-                              .addImm(*KnownSize)
-                              .addImm(0));
-      else
-        Code.emplace_back(MCInstBuilder(AArch64::ADDXrr)
-                              .addReg(AArch64::X0)
-                              .addReg(AArch64::X0)
-                              .addReg(AArch64::X2));
-    }
-
     uint64_t Size = *KnownSize;
-    return generateSizeSpecificMemcpy(Code, Size);
+
+    // Generate the optimized memcpy sequence
+    generateSizeSpecificMemcpy(Code, Size);
+
+    // If _memcpy8, adjust X0 to return dest+size instead of dest
+    if (ReturnEnd)
+      Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
+                            .addReg(AArch64::X0)
+                            .addReg(AArch64::X0)
+                            .addImm(Size)
+                            .addImm(0));
+    return Code;
   }
 
   InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 417b444f6a4bb..961e21f82851d 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,8 +7,8 @@
 # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
 # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
 
-# Verify BOLT reports that it inlined memcpy calls (8 successful inlines out of 11 total calls)
-# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls
+# Verify BOLT reports that it inlined memcpy calls (9 successful inlines out of 12 total calls)
+# CHECK-INLINE: BOLT-INFO: inlined 9 memcpy() calls
 
 # Each function should use optimal size-specific instructions and NO memcpy calls
 
@@ -79,6 +79,13 @@
 # CHECK-ASM-LABEL: <test_live_in_negative>:
 # CHECK-ASM: bl{{.*}}<memcpy
 
+# _memcpy8 should be inlined with end-pointer return (dest+size)
+# CHECK-ASM-LABEL: <test_memcpy8_4_byte>:
+# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
+# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: add{{.*}}x0, x0, #0x4
+# CHECK-ASM-NOT: bl{{.*}}<_memcpy8
+
 	.text
 	.globl	test_1_byte_direct                
 	.type	test_1_byte_direct,@function
@@ -226,7 +233,31 @@ test_live_in_negative:
 	ret
 	.size	test_live_in_negative, .-test_live_in_negative
 
+	.globl	test_memcpy8_4_byte
+	.type	test_memcpy8_4_byte,@function
+test_memcpy8_4_byte:
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8
+	mov	x2, #4
+	bl	_memcpy8
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_memcpy8_4_byte, .-test_memcpy8_4_byte
 
+	# Simple _memcpy8 implementation that calls memcpy and returns dest+size
+	.globl	_memcpy8
+	.type	_memcpy8,@function
+_memcpy8:
+	stp	x29, x30, [sp, #-16]!
+	mov	x29, sp
+	mov	x3, x0
+	bl	memcpy
+	add	x0, x3, x2
+	ldp	x29, x30, [sp], #16
+	ret
+	.size	_memcpy8, .-_memcpy8
 
 	.globl	main
 	.type	main,@function
@@ -245,6 +276,7 @@ main:
 	bl	test_4_byte_add_immediate
 	bl	test_register_move_negative
 	bl	test_live_in_negative
+	bl	test_memcpy8_4_byte
 	
 	mov	w0, #0
 	ldp	x29, x30, [sp], #16
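
For readers unfamiliar with the _memcpy8 convention: it copies like memcpy
but returns the end pointer, which is why the inlined sequence appends
"add x0, x0, #size". A C-level sketch of the contract the test's asm
helper implements:

    #include <cstddef>
    #include <cstring>

    // Same copy as memcpy, but the return value is the end pointer
    // (dest + size); the asm helper above does the same thing with
    // "mov x3, x0; bl memcpy; add x0, x3, x2".
    void *memcpy8_sketch(void *Dest, const void *Src, std::size_t Size) {
      std::memcpy(Dest, Src, Size);
      return static_cast<char *>(Dest) + Size;
    }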

>From 25cfb58b165fd1190f9b1b52cce1423d2db5d3c1 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Wed, 27 Aug 2025 06:54:14 -0700
Subject: [PATCH 11/26] nit: comment clean up

---
 bolt/lib/Passes/BinaryPasses.cpp                 | 6 +++---
 bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 9 ++++-----
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 1aade44286052..e8124dd3cb4f4 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1866,14 +1866,14 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
         const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
         const bool IsTailCall = BC.MIB->isTailCall(Inst);
 
-        // Extract size from preceding instructions (AArch64 only)
-        // Pattern: MOV X2, #num-bytes; BL memcpy(dest, src, X2)
+        // Extract size from preceding instructions (AArch64 only).
+        // Pattern: MOV X2, #num-bytes; BL memcpy(dest, src, X2).
         std::optional<uint64_t> KnownSize = std::nullopt;
         if (BC.isAArch64()) {
           BitVector WrittenRegs(BC.MRI->getNumRegs());
           MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
 
-          // Look backwards for size-setting instruction
+          // Look backwards for size-setting instruction.
           for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
             MCInst &Inst = *InstIt;
             WrittenRegs.reset();
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 366d4183bca51..67febc2324e14 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2622,10 +2622,10 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     InstructionListType Code;
     uint64_t Size = *KnownSize;
 
-    // Generate the optimized memcpy sequence
+    // Generate the optimized memcpy sequence.
     generateSizeSpecificMemcpy(Code, Size);
 
-    // If _memcpy8, adjust X0 to return dest+size instead of dest
+    // If _memcpy8, adjust X0 to return dest+size instead of dest.
     if (ReturnEnd)
       Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
                             .addReg(AArch64::X0)
@@ -2637,7 +2637,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
 
   InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
                                                  uint64_t Size) const {
-    // Helper to add load/store pair
     auto addLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
                                 unsigned Reg, unsigned Offset = 0) {
       Code.emplace_back(MCInstBuilder(LoadOpc)
@@ -2650,7 +2649,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
                             .addImm(Offset));
     };
 
-    // Generate optimal instruction sequences based on exact size
+    // Generate optimal instruction sequences based on exact size.
     switch (Size) {
     case 1:
       addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
@@ -2674,7 +2673,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
 
     default:
       if (Size <= 64) {
-        // For sizes up to 64 bytes, greedily use the largest possible loads
+        // For sizes up to 64 bytes, greedily use the largest possible loads.
         uint64_t Remaining = Size;
         uint64_t Offset = 0;
 

>From e308855758965504cca82484f66065d186c64093 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 02:12:27 -0700
Subject: [PATCH 12/26] minor refactor

---
 bolt/lib/Passes/BinaryPasses.cpp              | 11 +++++-----
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 22 +++++++------------
 2 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index e8124dd3cb4f4..022d06ae80e7b 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1872,6 +1872,7 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
         if (BC.isAArch64()) {
           BitVector WrittenRegs(BC.MRI->getNumRegs());
           MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
+          std::optional<uint64_t> ExtractedSize;
 
           // Look backwards for size-setting instruction.
           for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
@@ -1879,12 +1880,10 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
             WrittenRegs.reset();
             BC.MIB->getWrittenRegs(Inst, WrittenRegs);
 
-            if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) {
-              if (std::optional<uint64_t> ExtractedSize =
-                      BC.MIB->extractMoveImmediate(Inst, SizeReg)) {
-                KnownSize = *ExtractedSize;
-                break;
-              }
+            if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg] &&
+                (ExtractedSize = BC.MIB->extractMoveImmediate(Inst, SizeReg))) {
+              KnownSize = *ExtractedSize;
+              break;
             }
           }
         }
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 67febc2324e14..dfb5fe3cfe30d 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2599,20 +2599,17 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
   }
 
   InstructionListType createInlineMemcpy(bool ReturnEnd) const override {
-    // Fallback
     return createInlineMemcpy(ReturnEnd, std::nullopt);
   }
 
   std::optional<uint64_t>
   extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
-    if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3) {
-      if (Inst.getOperand(0).isReg() &&
-          Inst.getOperand(0).getReg() == TargetReg &&
-          Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() &&
-          Inst.getOperand(2).getImm() == 0) {
-        return Inst.getOperand(1).getImm();
-      }
-    }
+    if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3 &&
+        Inst.getOperand(0).isReg() &&
+        Inst.getOperand(0).getReg() == TargetReg &&
+        Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() &&
+        Inst.getOperand(2).getImm() == 0)
+      return Inst.getOperand(1).getImm();
     return std::nullopt;
   }
 
@@ -2622,7 +2619,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     InstructionListType Code;
     uint64_t Size = *KnownSize;
 
-    // Generate the optimized memcpy sequence.
     generateSizeSpecificMemcpy(Code, Size);
 
     // If _memcpy8, adjust X0 to return dest+size instead of dest.
@@ -2701,13 +2697,11 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
           Remaining -= 2;
           Offset += 2;
         }
-        if (Remaining == 1) {
+        if (Remaining == 1)
           addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3,
                            Offset);
-        }
-      } else {
+      } else
         Code.clear();
-      }
       break;
     }
     return Code;
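
For a concrete picture of what the simplified matcher accepts: the tests set
the size with "mov x2, #4", which disassembles to a MOVZXi whose operands are
the destination register, the 16-bit immediate, and the left-shift amount. A
sketch of that shape using LLVM's MCInstBuilder (AArch64 target headers
assumed; not code from the patch):

  #include "llvm/MC/MCInstBuilder.h"

  // The only form extractMoveImmediate matches: MOVZ Xd, #imm, LSL #0.
  llvm::MCInst SizeMov = llvm::MCInstBuilder(llvm::AArch64::MOVZXi)
                             .addReg(llvm::AArch64::X2) // operand 0: dest
                             .addImm(4)                 // operand 1: size
                             .addImm(0);                // operand 2: shift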

>From 365a0bfaa0d68e9a5c45f9b5163af49ca6d5c1b8 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 06:33:39 -0700
Subject: [PATCH 13/26] NFC: Post-review refactor

---
 bolt/include/bolt/Core/MCPlusBuilder.h        | 10 +++
 bolt/lib/Passes/BinaryPasses.cpp              | 21 +----
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 81 ++++++++++---------
 3 files changed, 55 insertions(+), 57 deletions(-)

diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index 6cbf288f3b8f4..3192472f5fbe0 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -14,6 +14,7 @@
 #ifndef BOLT_CORE_MCPLUSBUILDER_H
 #define BOLT_CORE_MCPLUSBUILDER_H
 
+#include "bolt/Core/BinaryBasicBlock.h"
 #include "bolt/Core/MCPlus.h"
 #include "bolt/Core/Relocation.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -1888,6 +1889,15 @@ class MCPlusBuilder {
     return {};
   }
 
+  /// Find memcpy size in bytes by using preceding instructions.
+  /// Returns std::nullopt if size cannot be determined (no-op for most
+  /// targets).
+  virtual std::optional<uint64_t>
+  findMemcpySizeInBytes(const BinaryBasicBlock &BB,
+                        BinaryBasicBlock::iterator CallInst) const {
+    return std::nullopt;
+  }
+
   /// Creates inline memcpy instruction. If \p ReturnEnd is true, then return
   /// (dest + n) instead of dest.
   virtual InstructionListType createInlineMemcpy(bool ReturnEnd) const {
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 022d06ae80e7b..f1807f6eb997e 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1868,25 +1868,8 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
 
         // Extract size from preceding instructions (AArch64 only).
         // Pattern: MOV X2, #nb-bytes; BL memcpy src, dest, X2.
-        std::optional<uint64_t> KnownSize = std::nullopt;
-        if (BC.isAArch64()) {
-          BitVector WrittenRegs(BC.MRI->getNumRegs());
-          MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);
-          std::optional<uint64_t> ExtractedSize;
-
-          // Look backwards for size-setting instruction.
-          for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
-            MCInst &Inst = *InstIt;
-            WrittenRegs.reset();
-            BC.MIB->getWrittenRegs(Inst, WrittenRegs);
-
-            if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg] &&
-                (ExtractedSize = BC.MIB->extractMoveImmediate(Inst, SizeReg))) {
-              KnownSize = *ExtractedSize;
-              break;
-            }
-          }
-        }
+        std::optional<uint64_t> KnownSize =
+            BC.MIB->findMemcpySizeInBytes(BB, II);
 
         if (BC.isAArch64() && !KnownSize.has_value()) {
           ++II;
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index dfb5fe3cfe30d..6f539b8588f2e 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2604,15 +2604,33 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
 
   std::optional<uint64_t>
   extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
-    if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3 &&
-        Inst.getOperand(0).isReg() &&
+    // Match MOVZXi with the target register and no shift.
+    if (Inst.getOpcode() == AArch64::MOVZXi &&
         Inst.getOperand(0).getReg() == TargetReg &&
-        Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() &&
         Inst.getOperand(2).getImm() == 0)
       return Inst.getOperand(1).getImm();
     return std::nullopt;
   }
 
+  std::optional<uint64_t>
+  findMemcpySizeInBytes(const BinaryBasicBlock &BB,
+                        BinaryBasicBlock::iterator CallInst) const override {
+    BitVector WrittenRegs(RegInfo->getNumRegs());
+    MCPhysReg SizeReg = getIntArgRegister(2);
+    std::optional<uint64_t> ExtractedSize;
+
+    for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) {
+      const MCInst &Inst = *InstIt;
+      WrittenRegs.reset();
+      getWrittenRegs(Inst, WrittenRegs);
+
+      if (SizeReg != getNoRegister() && WrittenRegs[SizeReg] &&
+          (ExtractedSize = extractMoveImmediate(Inst, SizeReg)))
+        return *ExtractedSize;
+    }
+    return std::nullopt;
+  }
+
   InstructionListType
   createInlineMemcpy(bool ReturnEnd,
                      std::optional<uint64_t> KnownSize) const override {
@@ -2633,7 +2651,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
 
   InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
                                                  uint64_t Size) const {
-    auto addLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
+    auto AddLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
                                 unsigned Reg, unsigned Offset = 0) {
       Code.emplace_back(MCInstBuilder(LoadOpc)
                             .addReg(Reg)
@@ -2648,23 +2666,23 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     // Generate optimal instruction sequences based on exact size.
     switch (Size) {
     case 1:
-      addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
+      AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
       break;
     case 2:
-      addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3);
+      AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3);
       break;
     case 4:
-      addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3);
+      AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3);
       break;
     case 8:
-      addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3);
+      AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3);
       break;
     case 16:
-      addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0);
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0);
       break;
     case 32:
-      addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0);
-      addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1);
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0);
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1);
       break;
 
     default:
@@ -2673,33 +2691,20 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
         uint64_t Remaining = Size;
         uint64_t Offset = 0;
 
-        while (Remaining >= 16) {
-          addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0,
-                           Offset / 16);
-          Remaining -= 16;
-          Offset += 16;
-        }
-        if (Remaining >= 8) {
-          addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3,
-                           Offset / 8);
-          Remaining -= 8;
-          Offset += 8;
-        }
-        if (Remaining >= 4) {
-          addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3,
-                           Offset / 4);
-          Remaining -= 4;
-          Offset += 4;
-        }
-        if (Remaining >= 2) {
-          addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3,
-                           Offset / 2);
-          Remaining -= 2;
-          Offset += 2;
-        }
-        if (Remaining == 1)
-          addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3,
-                           Offset);
+        const std::array<std::tuple<uint64_t, unsigned, unsigned, unsigned>, 5>
+            LoadStoreOps = {
+                {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q0},
+                 {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X3},
+                 {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W3},
+                 {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3},
+                 {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3}}};
+
+        for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
+          while (Remaining >= OpSize) {
+            AddLoadStorePair(LoadOp, StoreOp, TempReg, Offset / OpSize);
+            Remaining -= OpSize;
+            Offset += OpSize;
+          }
       } else
         Code.clear();
       break;
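
The table-driven loop is behavior-preserving relative to the if-chain it
replaces. A self-contained sketch of the same greedy chunking, handy for
sanity-checking the 37-byte case from the test (37 = 16 + 16 + 4 + 1):

  #include <cstdint>
  #include <vector>

  // Mirror of the pass's greedy decomposition (sketch, not BOLT code):
  // repeatedly take the largest load/store width that still fits.
  std::vector<uint64_t> chunkSizes(uint64_t Size) {
    static const uint64_t OpSizes[] = {16, 8, 4, 2, 1};
    std::vector<uint64_t> Chunks;
    for (uint64_t OpSize : OpSizes)
      while (Size >= OpSize) {
        Chunks.push_back(OpSize);
        Size -= OpSize;
      }
    return Chunks; // chunkSizes(37) == {16, 16, 4, 1}
  }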

>From 84c904ac68b263b48227b3308ad16c795382b7c3 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 06:42:47 -0700
Subject: [PATCH 14/26] NFC: Test for corner case with size 0

---
 bolt/test/runtime/AArch64/inline-memcpy.s | 25 ++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 961e21f82851d..3acb5e394d52d 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,8 +7,8 @@
 # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
 # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
 
-# Verify BOLT reports that it inlined memcpy calls (9 successful inlines out of 12 total calls)
-# CHECK-INLINE: BOLT-INFO: inlined 9 memcpy() calls
+# Verify BOLT reports that it inlined memcpy calls (10 successful inlines out of 13 total calls)
+# CHECK-INLINE: BOLT-INFO: inlined 10 memcpy() calls
 
 # Each function should use optimal size-specific instructions and NO memcpy calls
 
@@ -62,6 +62,12 @@
 # CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
+# 0-byte copy should be inlined with no load/store instructions (nothing to copy)
+# CHECK-ASM-LABEL: <test_0_byte>:
+# CHECK-ASM-NOT: ldr
+# CHECK-ASM-NOT: str  
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
 # 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
 # CHECK-ASM-LABEL: <test_128_byte_too_large>:
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
@@ -178,6 +184,19 @@ test_37_byte_arbitrary:
 	ret
 	.size	test_37_byte_arbitrary, .-test_37_byte_arbitrary
 
+	.globl	test_0_byte
+	.type	test_0_byte,@function
+test_0_byte:
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8
+	mov	x2, #0
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_0_byte, .-test_0_byte
+
 	.globl	test_128_byte_too_large
 	.type	test_128_byte_too_large,@function
 test_128_byte_too_large:
@@ -272,12 +291,12 @@ main:
 	bl	test_16_byte_direct  
 	bl	test_32_byte_direct
 	bl	test_37_byte_arbitrary
+	bl	test_0_byte
 	bl	test_128_byte_too_large
 	bl	test_4_byte_add_immediate
 	bl	test_register_move_negative
 	bl	test_live_in_negative
 	bl	test_memcpy8_4_byte
-	bl	test_memcpy8_large_size
 	
 	mov	w0, #0
 	ldp	x29, x30, [sp], #16
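
The size-0 corner case needs no special handling: generateSizeSpecificMemcpy
falls through to the greedy path, the loop emits no load/store pairs for a
remaining count of zero, and the resulting empty replacement legitimately
deletes the call, since copying zero bytes is a no-op. (Contrast this with
the >64 case addressed in a later patch, where an empty sequence is wrong.)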

>From 0561bccf755709811eed3d13e10bdcd2afa5fbe3 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 10:01:21 -0700
Subject: [PATCH 15/26] Use temp instead of argument registers

---
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 6f539b8588f2e..f17a91bc3ba76 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2666,23 +2666,23 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     // Generate optimal instruction sequences based on exact size.
     switch (Size) {
     case 1:
-      AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3);
+      AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9);
       break;
     case 2:
-      AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3);
+      AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9);
       break;
     case 4:
-      AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3);
+      AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W9);
       break;
     case 8:
-      AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3);
+      AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X9);
       break;
     case 16:
-      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0);
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16);
       break;
     case 32:
-      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0);
-      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1);
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16, 0);
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q17, 1);
       break;
 
     default:
@@ -2693,11 +2693,11 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
 
         const std::array<std::tuple<uint64_t, unsigned, unsigned, unsigned>, 5>
             LoadStoreOps = {
-                {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q0},
-                 {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X3},
-                 {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W3},
-                 {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3},
-                 {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3}}};
+                {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16},
+                 {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9},
+                 {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9},
+                 {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9},
+                 {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}}};
 
         for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
           while (Remaining >= OpSize) {
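
On the register choice: under the AAPCS64 calling convention, X9/W9 are
caller-saved scratch registers and V16-V31 (hence Q16/Q17) are caller-saved
vector temporaries, none of which carry arguments or return values. Anything
live across the original bl memcpy could not have been kept in caller-saved
registers anyway, so the inlined sequence may clobber them freely; moving off
X3/W3 and Q0/Q1 avoids reusing registers from the argument/return ranges.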

>From cc49db79eea544305571e5e91caa3328c91cf4a7 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 10:01:54 -0700
Subject: [PATCH 16/26] Update early return

---
 bolt/lib/Passes/BinaryPasses.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index f1807f6eb997e..d40f5fb78c7f3 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1871,10 +1871,8 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
         std::optional<uint64_t> KnownSize =
             BC.MIB->findMemcpySizeInBytes(BB, II);
 
-        if (BC.isAArch64() && !KnownSize.has_value()) {
-          ++II;
+        if (BC.isAArch64() && !KnownSize.has_value())
           continue;
-        }
 
         const InstructionListType NewCode =
             BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);

>From 115606be208c8b6675df59b9f231dd709ea863fd Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 28 Aug 2025 10:02:48 -0700
Subject: [PATCH 17/26] Update tests to be more specific about registers and
 add a negative test for the early-return check

---
 bolt/test/runtime/AArch64/inline-memcpy.s | 70 +++++++++++++++--------
 1 file changed, 45 insertions(+), 25 deletions(-)

diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 3acb5e394d52d..14a95d91dd189 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,59 +7,59 @@
 # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
 # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
 
-# Verify BOLT reports that it inlined memcpy calls (10 successful inlines out of 13 total calls)
+# Verify BOLT reports that it inlined memcpy calls (10 successful inlines out of 14 total calls)
 # CHECK-INLINE: BOLT-INFO: inlined 10 memcpy() calls
 
 # Each function should use optimal size-specific instructions and NO memcpy calls
 
 # 1-byte copy should use single byte load/store (ldrb/strb)
 # CHECK-ASM-LABEL: <test_1_byte_direct>:
-# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: ldrb{{.*}}w9, [x1]
+# CHECK-ASM: strb{{.*}}w9, [x0]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 2-byte copy should use single 16-bit load/store (ldrh/strh)
 # CHECK-ASM-LABEL: <test_2_byte_direct>:
-# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: ldrh{{.*}}w9, [x1]
+# CHECK-ASM: strh{{.*}}w9, [x0]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 4-byte copy should use single 32-bit load/store (w register)
 # CHECK-ASM-LABEL: <test_4_byte_direct>:
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}w9, [x1]
+# CHECK-ASM: str{{.*}}w9, [x0]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 8-byte copy should use single 64-bit load/store (x register)
 # CHECK-ASM-LABEL: <test_8_byte_direct>:
-# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}x9, [x1]
+# CHECK-ASM: str{{.*}}x9, [x0]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 16-byte copy should use single 128-bit SIMD load/store (q register)
 # CHECK-ASM-LABEL: <test_16_byte_direct>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}q16, [x1]
+# CHECK-ASM: str{{.*}}q16, [x0]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 32-byte copy should use two 128-bit SIMD operations
 # CHECK-ASM-LABEL: <test_32_byte_direct>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
+# CHECK-ASM: ldr{{.*}}q16, [x1]
+# CHECK-ASM: str{{.*}}q16, [x0]
+# CHECK-ASM: ldr{{.*}}q17, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q17, [x0, #0x10]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
 # CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0]
-# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10]
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20]
-# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24]
-# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24]
+# CHECK-ASM: ldr{{.*}}q16, [x1]
+# CHECK-ASM: str{{.*}}q16, [x0]
+# CHECK-ASM: ldr{{.*}}q16, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q16, [x0, #0x10]
+# CHECK-ASM: ldr{{.*}}w9, [x1, #0x20]
+# CHECK-ASM: str{{.*}}w9, [x0, #0x20]
+# CHECK-ASM: ldrb{{.*}}w9, [x1, #0x24]
+# CHECK-ASM: strb{{.*}}w9, [x0, #0x24]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 0-byte copy should be inlined with no load/store instructions (nothing to copy)
@@ -85,10 +85,14 @@
 # CHECK-ASM-LABEL: <test_live_in_negative>:
 # CHECK-ASM: bl{{.*}}<memcpy
 
+# Register-based size should NOT be inlined (size is not a compile-time constant)
+# CHECK-ASM-LABEL: <test_register_size_negative>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
 # _memcpy8 should be inlined with end-pointer return (dest+size)
 # CHECK-ASM-LABEL: <test_memcpy8_4_byte>:
-# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1]
-# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0]
+# CHECK-ASM: ldr{{.*}}w9, [x1]
+# CHECK-ASM: str{{.*}}w9, [x0]
 # CHECK-ASM: add{{.*}}x0, x0, #0x4
 # CHECK-ASM-NOT: bl{{.*}}<_memcpy8
 
@@ -252,6 +256,21 @@ test_live_in_negative:
 	ret
 	.size	test_live_in_negative, .-test_live_in_negative
 
+	.globl	test_register_size_negative
+	.type	test_register_size_negative,@function
+test_register_size_negative:
+	# This would crash without isAArch64() check: size from register parameter
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8
+	mov	x3, #4
+	mov	x2, x3
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_register_size_negative, .-test_register_size_negative
+
 	.globl	test_memcpy8_4_byte
 	.type	test_memcpy8_4_byte,@function
 test_memcpy8_4_byte:
@@ -296,6 +315,7 @@ main:
 	bl	test_4_byte_add_immediate
 	bl	test_register_move_negative
 	bl	test_live_in_negative
+	bl	test_register_size_negative
 	bl	test_memcpy8_4_byte
 	
 	mov	w0, #0
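
Why the new negative case holds: "mov x2, x3" is a register-to-register move
(an alias of ORR with XZR, not a MOVZ), so extractMoveImmediate finds no
constant, findMemcpySizeInBytes returns std::nullopt, and the guarded
continue skips the call rather than reaching createInlineMemcpy without a
known size.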

>From 1986bfac3fcfdd3b8036096c72d7f1ed03fea1bc Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 29 Aug 2025 08:03:58 -0700
Subject: [PATCH 18/26] Complex test + register aliasing

---
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   |  18 +--
 bolt/test/runtime/AArch64/inline-memcpy.s     | 107 +++++++++++++++++-
 2 files changed, 113 insertions(+), 12 deletions(-)

diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index f17a91bc3ba76..12e226a00e26d 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2604,10 +2604,11 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
 
   std::optional<uint64_t>
   extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
-    // Match MOVZXi with the target register and no shift.
-    if (Inst.getOpcode() == AArch64::MOVZXi &&
-        Inst.getOperand(0).getReg() == TargetReg &&
-        Inst.getOperand(2).getImm() == 0)
+    // Match MOVZ instructions (both X and W register variants) with no shift.
+    if ((Inst.getOpcode() == AArch64::MOVZXi ||
+         Inst.getOpcode() == AArch64::MOVZWi) &&
+        Inst.getOperand(2).getImm() == 0 &&
+        getAliases(TargetReg)[Inst.getOperand(0).getReg()])
       return Inst.getOperand(1).getImm();
     return std::nullopt;
   }
@@ -2617,16 +2618,17 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
                         BinaryBasicBlock::iterator CallInst) const override {
     BitVector WrittenRegs(RegInfo->getNumRegs());
     MCPhysReg SizeReg = getIntArgRegister(2);
-    std::optional<uint64_t> ExtractedSize;
+    const BitVector &SizeRegAliases = getAliases(SizeReg);
 
     for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) {
       const MCInst &Inst = *InstIt;
       WrittenRegs.reset();
       getWrittenRegs(Inst, WrittenRegs);
 
-      if (SizeReg != getNoRegister() && WrittenRegs[SizeReg] &&
-          (ExtractedSize = extractMoveImmediate(Inst, SizeReg)))
-        return *ExtractedSize;
+      if (SizeReg != getNoRegister() && WrittenRegs.anyCommon(SizeRegAliases)) {
+        if (auto ExtractedSize = extractMoveImmediate(Inst, SizeReg))
+          return *ExtractedSize;
+      }
     }
     return std::nullopt;
   }
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 14a95d91dd189..eb6851bbe7e0b 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,8 +7,8 @@
 # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
 # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
 
-# Verify BOLT reports that it inlined memcpy calls (10 successful inlines out of 14 total calls)
-# CHECK-INLINE: BOLT-INFO: inlined 10 memcpy() calls
+# Verify BOLT reports that it inlined memcpy calls (12 successful inlines out of 16 total calls)
+# CHECK-INLINE: BOLT-INFO: inlined 12 memcpy() calls
 
 # Each function should use optimal size-specific instructions and NO memcpy calls
 
@@ -96,6 +96,24 @@
 # CHECK-ASM: add{{.*}}x0, x0, #0x4
 # CHECK-ASM-NOT: bl{{.*}}<_memcpy8
 
+# Complex function with caller-saved X9 should inline 8-byte memcpy using X9 as temp register
+# CHECK-ASM-LABEL: <complex_operation>:
+# CHECK-ASM: ldr{{.*}}x9, [x1]
+# CHECK-ASM: str{{.*}}x9, [x0]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
+# Complex function with caller-saved Q16/Q17 should inline 64-byte memcpy using Q16 as temp register
+# CHECK-ASM-LABEL: <complex_fp_operation>:
+# CHECK-ASM: ldr{{.*}}q16, [x1]
+# CHECK-ASM: str{{.*}}q16, [x0]
+# CHECK-ASM: ldr{{.*}}q16, [x1, #0x10]
+# CHECK-ASM: str{{.*}}q16, [x0, #0x10]
+# CHECK-ASM: ldr{{.*}}q16, [x1, #0x20]
+# CHECK-ASM: str{{.*}}q16, [x0, #0x20]
+# CHECK-ASM: ldr{{.*}}q16, [x1, #0x30]
+# CHECK-ASM: str{{.*}}q16, [x0, #0x30]
+# CHECK-ASM-NOT: bl{{.*}}<memcpy
+
 	.text
 	.globl	test_1_byte_direct                
 	.type	test_1_byte_direct,@function
@@ -297,10 +315,80 @@ _memcpy8:
 	ret
 	.size	_memcpy8, .-_memcpy8
 
+	.globl	complex_operation
+	.type	complex_operation,@function
+complex_operation:
+	stp     x29, x30, [sp, #-32]!
+	str     x19, [sp, #16]
+	mov     x29, sp
+	ldp     x9, x10, [x0]
+	ldp     x11, x12, [x0, #16]
+	mov     x19, x1
+	mov     x8, x0
+	add     x0, x1, #32
+	madd    x9, x9, x2, x3
+	and     x10, x10, x4
+	asr     x12, x12, #2
+	mov     w2, #8
+	orr     x11, x12, x11, lsl #3
+	eor     x12, x9, x10
+	mul     x10, x11, x10
+	eor     x12, x12, x11
+	add     x13, x12, x9
+	add     x9, x11, x9, asr #4
+	stp     x13, x10, [x1]
+	mov     w10, w12
+	stp     x9, x10, [x1, #16]
+	add     x1, x8, #32
+	bl      memcpy
+	ldr     x0, [x19, #16]
+	ldr     x19, [sp, #16]
+	ldp     x29, x30, [sp], #32
+	b       use
+	.size	complex_operation, .-complex_operation
+
+	.globl	use
+	.type	use,@function
+use:
+	ret
+	.size	use, .-use
+
+# Same as above but using FP caller-saved registers (Q16/17)
+	.globl	complex_fp_operation
+	.type	complex_fp_operation,@function
+complex_fp_operation:
+	stp     x29, x30, [sp, #-48]!
+	stp     q8, q9, [sp, #16]
+	mov     x29, sp
+	ldr     q16, [x0]
+	ldr     q17, [x0, #16]
+	mov     x8, x0
+	add     x0, x1, #32
+	fadd    v16.4s, v16.4s, v17.4s
+	fmul    v17.4s, v16.4s, v17.4s
+	fsub    v16.2d, v16.2d, v17.2d
+	mov     w2, #64
+	fmax    v17.4s, v16.4s, v17.4s
+	fmin    v16.2d, v16.2d, v17.2d
+	str     q16, [x1]
+	str     q17, [x1, #16]
+	add     x1, x8, #32
+	bl      memcpy
+	ldp     q8, q9, [sp, #16]
+	ldp     x29, x30, [sp], #48
+	b       use_fp
+	.size	complex_fp_operation, .-complex_fp_operation
+
+	.globl	use_fp
+	.type	use_fp,@function
+use_fp:
+	ret
+	.size	use_fp, .-use_fp
+
 	.globl	main
 	.type	main,@function
 main:
-	stp	x29, x30, [sp, #-16]!
+	stp	x29, x30, [sp, #-208]!
 	mov	x29, sp
 	
 	bl	test_1_byte_direct
@@ -318,7 +406,18 @@ main:
 	bl	test_register_size_negative
 	bl	test_memcpy8_4_byte
 	
+	add     x0, sp, #32
+	add     x1, sp, #96
+	mov     x2, #10
+	mov     x3, #20
+	mov     x4, #0xFF
+	bl      complex_operation
+	
+	add     x0, sp, #160
+	add     x1, sp, #96
+	bl      complex_fp_operation
+	
 	mov	w0, #0
-	ldp	x29, x30, [sp], #16
+	ldp	x29, x30, [sp], #208
 	ret
 	.size	main, .-main
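
The aliasing change is the substantive part of this patch: the new complex
tests set the size with "mov w2, #8" / "mov w2, #64", which define W2, while
the tracked size register is X2. A schematic C++ sketch of the difference
(assumes the surrounding MCPlusBuilder context for RegInfo/getAliases):

  // Exact-register tracking misses sub-register defs:
  llvm::BitVector Written(RegInfo->getNumRegs());
  Written.set(AArch64::W2);          // "mov w2, #64" defines W2 only
  bool Exact = Written[AArch64::X2];                         // false
  // Alias-aware tracking fires, since W2 is in X2's alias set:
  bool Aliased = Written.anyCommon(getAliases(AArch64::X2)); // true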

>From bd990ea7582ee01e5872014d05470d9fafdfea2c Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Mon, 1 Sep 2025 01:40:32 -0700
Subject: [PATCH 19/26] NFC: use if initializer

---
 bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 12e226a00e26d..707856b5874ea 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2625,10 +2625,9 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
       WrittenRegs.reset();
       getWrittenRegs(Inst, WrittenRegs);
 
-      if (SizeReg != getNoRegister() && WrittenRegs.anyCommon(SizeRegAliases)) {
-        if (auto ExtractedSize = extractMoveImmediate(Inst, SizeReg))
-          return *ExtractedSize;
-      }
+      if (SizeReg != getNoRegister() && WrittenRegs.anyCommon(SizeRegAliases);
+          auto ExtractedSize = extractMoveImmediate(Inst, SizeReg))
+        return *ExtractedSize;
     }
     return std::nullopt;
   }
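
A caution about the form introduced here: in "if (init-statement; condition)"
only the condition controls the branch; the init-statement is evaluated and
its value discarded. As written, the anyCommon() guard sits in the init
position, so the branch is decided by extractMoveImmediate alone (which does
recheck the register via getAliases). A minimal standalone illustration:

  #include <optional>

  int Guard = 0;
  // "Guard == 1" is the init-statement: evaluated, then dropped.
  // Only V's truthiness is tested, so this branch is taken.
  if (Guard == 1; auto V = std::make_optional(42)) {
    // ...
  }

Patch 23 below reworks this into an explicit anyCommon() test, restoring the
guard.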

>From ee5f859f26eb3272934ff03cef8bcb52ab772e89 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 4 Sep 2025 09:07:33 -0700
Subject: [PATCH 20/26] [style] trailing whitespace removed

---
 bolt/test/runtime/AArch64/inline-memcpy.s | 52 +++++++++++------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index eb6851bbe7e0b..0bcb7514afad3 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -3,7 +3,7 @@
 # REQUIRES: system-linux, aarch64-registered-target
 
 # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
-# RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q 
+# RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q
 # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
 # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
 
@@ -65,7 +65,7 @@
 # 0-byte copy should be inlined with no load/store instructions (nothing to copy)
 # CHECK-ASM-LABEL: <test_0_byte>:
 # CHECK-ASM-NOT: ldr
-# CHECK-ASM-NOT: str  
+# CHECK-ASM-NOT: str
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
@@ -115,52 +115,52 @@
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 	.text
-	.globl	test_1_byte_direct                
+	.globl	test_1_byte_direct
 	.type	test_1_byte_direct,@function
-test_1_byte_direct:                              
-	stp	x29, x30, [sp, #-32]!           
+test_1_byte_direct:
+	stp	x29, x30, [sp, #-32]!
 	mov	x29, sp
 	add	x1, sp, #16
-	add	x0, sp, #8  
+	add	x0, sp, #8
 	mov	x2, #1
 	bl	memcpy
 	ldp	x29, x30, [sp], #32
 	ret
 	.size	test_1_byte_direct, .-test_1_byte_direct
 
-	.globl	test_2_byte_direct                
+	.globl	test_2_byte_direct
 	.type	test_2_byte_direct,@function
-test_2_byte_direct:                              
-	stp	x29, x30, [sp, #-32]!           
+test_2_byte_direct:
+	stp	x29, x30, [sp, #-32]!
 	mov	x29, sp
 	add	x1, sp, #16
-	add	x0, sp, #8  
+	add	x0, sp, #8
 	mov	x2, #2
 	bl	memcpy
 	ldp	x29, x30, [sp], #32
 	ret
 	.size	test_2_byte_direct, .-test_2_byte_direct
 
-	.globl	test_4_byte_direct                
+	.globl	test_4_byte_direct
 	.type	test_4_byte_direct,@function
-test_4_byte_direct:                              
-	stp	x29, x30, [sp, #-32]!           
+test_4_byte_direct:
+	stp	x29, x30, [sp, #-32]!
 	mov	x29, sp
 	add	x1, sp, #16
-	add	x0, sp, #8  
+	add	x0, sp, #8
 	mov	x2, #4
 	bl	memcpy
 	ldp	x29, x30, [sp], #32
 	ret
 	.size	test_4_byte_direct, .-test_4_byte_direct
 
-	.globl	test_8_byte_direct                
+	.globl	test_8_byte_direct
 	.type	test_8_byte_direct,@function
-test_8_byte_direct:                              
-	stp	x29, x30, [sp, #-32]!           
+test_8_byte_direct:
+	stp	x29, x30, [sp, #-32]!
 	mov	x29, sp
 	add	x1, sp, #16
-	add	x0, sp, #8  
+	add	x0, sp, #8
 	mov	x2, #8
 	bl	memcpy
 	ldp	x29, x30, [sp], #32
@@ -185,7 +185,7 @@ test_16_byte_direct:
 test_32_byte_direct:
 	stp	x29, x30, [sp, #-80]!
 	mov	x29, sp
-	add	x1, sp, #16  
+	add	x1, sp, #16
 	add	x0, sp, #48
 	mov	x2, #32
 	bl	memcpy
@@ -198,7 +198,7 @@ test_32_byte_direct:
 test_37_byte_arbitrary:
 	stp	x29, x30, [sp, #-96]!
 	mov	x29, sp
-	add	x1, sp, #16  
+	add	x1, sp, #16
 	add	x0, sp, #56
 	mov	x2, #37
 	bl	memcpy
@@ -224,7 +224,7 @@ test_0_byte:
 test_128_byte_too_large:
 	stp	x29, x30, [sp, #-288]!
 	mov	x29, sp
-	add	x1, sp, #16  
+	add	x1, sp, #16
 	add	x0, sp, #152
 	mov	x2, #128
 	bl	memcpy
@@ -390,12 +390,12 @@ use_fp:
 main:
 	stp	x29, x30, [sp, #-208]!
 	mov	x29, sp
-	
+
 	bl	test_1_byte_direct
 	bl	test_2_byte_direct
 	bl	test_4_byte_direct
 	bl	test_8_byte_direct
-	bl	test_16_byte_direct  
+	bl	test_16_byte_direct
 	bl	test_32_byte_direct
 	bl	test_37_byte_arbitrary
 	bl	test_0_byte
@@ -405,18 +405,18 @@ main:
 	bl	test_live_in_negative
 	bl	test_register_size_negative
 	bl	test_memcpy8_4_byte
-	
+
 	add     x0, sp, #32
 	add     x1, sp, #96
 	mov     x2, #10
 	mov     x3, #20
 	mov     x4, #0xFF
 	bl      complex_operation
-	
+
 	add     x0, sp, #160
 	add     x1, sp, #96
 	bl      complex_fp_operation
-	
+
 	mov	w0, #0
 	ldp	x29, x30, [sp], #208
 	ret

>From ad503a791330dd6072a89ebbd73eac71829629c4 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 4 Sep 2025 09:18:07 -0700
Subject: [PATCH 21/26] [test] CHECK-NEXT used

---
 bolt/test/runtime/AArch64/inline-memcpy.s | 50 +++++++++++------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 0bcb7514afad3..3222935b74fef 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -15,51 +15,51 @@
 # 1-byte copy should use single byte load/store (ldrb/strb)
 # CHECK-ASM-LABEL: <test_1_byte_direct>:
 # CHECK-ASM: ldrb{{.*}}w9, [x1]
-# CHECK-ASM: strb{{.*}}w9, [x0]
+# CHECK-ASM-NEXT: strb{{.*}}w9, [x0]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 2-byte copy should use single 16-bit load/store (ldrh/strh)
 # CHECK-ASM-LABEL: <test_2_byte_direct>:
 # CHECK-ASM: ldrh{{.*}}w9, [x1]
-# CHECK-ASM: strh{{.*}}w9, [x0]
+# CHECK-ASM-NEXT: strh{{.*}}w9, [x0]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 4-byte copy should use single 32-bit load/store (w register)
 # CHECK-ASM-LABEL: <test_4_byte_direct>:
 # CHECK-ASM: ldr{{.*}}w9, [x1]
-# CHECK-ASM: str{{.*}}w9, [x0]
+# CHECK-ASM-NEXT: str{{.*}}w9, [x0]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 8-byte copy should use single 64-bit load/store (x register)
 # CHECK-ASM-LABEL: <test_8_byte_direct>:
 # CHECK-ASM: ldr{{.*}}x9, [x1]
-# CHECK-ASM: str{{.*}}x9, [x0]
+# CHECK-ASM-NEXT: str{{.*}}x9, [x0]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 16-byte copy should use single 128-bit SIMD load/store (q register)
 # CHECK-ASM-LABEL: <test_16_byte_direct>:
 # CHECK-ASM: ldr{{.*}}q16, [x1]
-# CHECK-ASM: str{{.*}}q16, [x0]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 32-byte copy should use two 128-bit SIMD operations
 # CHECK-ASM-LABEL: <test_32_byte_direct>:
 # CHECK-ASM: ldr{{.*}}q16, [x1]
-# CHECK-ASM: str{{.*}}q16, [x0]
-# CHECK-ASM: ldr{{.*}}q17, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q17, [x0, #0x10]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
+# CHECK-ASM-NEXT: ldr{{.*}}q17, [x1, #0x10]
+# CHECK-ASM-NEXT: str{{.*}}q17, [x0, #0x10]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 37-byte copy should use greedy decomposition: (2*16) + (1*4) + (1*1)
 # CHECK-ASM-LABEL: <test_37_byte_arbitrary>:
 # CHECK-ASM: ldr{{.*}}q16, [x1]
-# CHECK-ASM: str{{.*}}q16, [x0]
-# CHECK-ASM: ldr{{.*}}q16, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q16, [x0, #0x10]
-# CHECK-ASM: ldr{{.*}}w9, [x1, #0x20]
-# CHECK-ASM: str{{.*}}w9, [x0, #0x20]
-# CHECK-ASM: ldrb{{.*}}w9, [x1, #0x24]
-# CHECK-ASM: strb{{.*}}w9, [x0, #0x24]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
+# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x10]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x10]
+# CHECK-ASM-NEXT: ldr{{.*}}w9, [x1, #0x20]
+# CHECK-ASM-NEXT: str{{.*}}w9, [x0, #0x20]
+# CHECK-ASM-NEXT: ldrb{{.*}}w9, [x1, #0x24]
+# CHECK-ASM-NEXT: strb{{.*}}w9, [x0, #0x24]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # 0-byte copy should be inlined with no load/store instructions (nothing to copy)
@@ -92,26 +92,26 @@
 # _memcpy8 should be inlined with end-pointer return (dest+size)
 # CHECK-ASM-LABEL: <test_memcpy8_4_byte>:
 # CHECK-ASM: ldr{{.*}}w9, [x1]
-# CHECK-ASM: str{{.*}}w9, [x0]
-# CHECK-ASM: add{{.*}}x0, x0, #0x4
+# CHECK-ASM-NEXT: str{{.*}}w9, [x0]
+# CHECK-ASM-NEXT: add{{.*}}x0, x0, #0x4
 # CHECK-ASM-NOT: bl{{.*}}<_memcpy8
 
 # Complex function with caller-saved X9 should inline 8-byte memcpy using X9 as temp register
 # CHECK-ASM-LABEL: <complex_operation>:
 # CHECK-ASM: ldr{{.*}}x9, [x1]
-# CHECK-ASM: str{{.*}}x9, [x0]
+# CHECK-ASM-NEXT: str{{.*}}x9, [x0]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 # Complex function with caller-saved Q16/Q17 should inline 64-byte memcpy using Q16 as temp register
 # CHECK-ASM-LABEL: <complex_fp_operation>:
 # CHECK-ASM: ldr{{.*}}q16, [x1]
-# CHECK-ASM: str{{.*}}q16, [x0]
-# CHECK-ASM: ldr{{.*}}q16, [x1, #0x10]
-# CHECK-ASM: str{{.*}}q16, [x0, #0x10]
-# CHECK-ASM: ldr{{.*}}q16, [x1, #0x20]
-# CHECK-ASM: str{{.*}}q16, [x0, #0x20]
-# CHECK-ASM: ldr{{.*}}q16, [x1, #0x30]
-# CHECK-ASM: str{{.*}}q16, [x0, #0x30]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0]
+# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x10]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x10]
+# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x20]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x20]
+# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x30]
+# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x30]
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
 	.text
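
The tightening matters: FileCheck's CHECK-NEXT requires its match on the line
immediately after the previous match, so any stray instruction appearing
between a paired load and store now fails the test, where a plain CHECK would
quietly skip ahead.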

>From 267432aeba503799df057914112ae3450a53fc9b Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 4 Sep 2025 09:58:09 -0700
Subject: [PATCH 22/26] [test] updated negative test to check for negative size

---
 bolt/test/runtime/AArch64/inline-memcpy.s | 54 ++++-------------------
 1 file changed, 8 insertions(+), 46 deletions(-)

diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index 3222935b74fef..ee934bc50dbd5 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -85,8 +85,8 @@
 # CHECK-ASM-LABEL: <test_live_in_negative>:
 # CHECK-ASM: bl{{.*}}<memcpy
 
-# Register-based size should NOT be inlined (size is not a compile-time constant)
-# CHECK-ASM-LABEL: <test_register_size_negative>:
+# Negative size should NOT be inlined (invalid size parameter)
+# CHECK-ASM-LABEL: <test_negative_size>:
 # CHECK-ASM: bl{{.*}}<memcpy
 
 # _memcpy8 should be inlined with end-pointer return (dest+size)
@@ -274,20 +274,19 @@ test_live_in_negative:
 	ret
 	.size	test_live_in_negative, .-test_live_in_negative
 
-	.globl	test_register_size_negative
-	.type	test_register_size_negative,@function
-test_register_size_negative:
-	# This would crash without isAArch64() check: size from register parameter
+	.globl	test_negative_size
+	.type	test_negative_size,@function
+test_negative_size:
+	# Negative size should not be inlined
 	stp	x29, x30, [sp, #-32]!
 	mov	x29, sp
 	add	x1, sp, #16
 	add	x0, sp, #8
-	mov	x3, #4
-	mov	x2, x3
+	mov	x2, #-1
 	bl	memcpy
 	ldp	x29, x30, [sp], #32
 	ret
-	.size	test_register_size_negative, .-test_register_size_negative
+	.size	test_negative_size, .-test_negative_size
 
 	.globl	test_memcpy8_4_byte
 	.type	test_memcpy8_4_byte, at function
@@ -384,40 +383,3 @@ complex_fp_operation:
 use_fp:
 	ret
 	.size	use_fp, .-use_fp
-
-	.globl	main
-	.type	main,@function
-main:
-	stp	x29, x30, [sp, #-208]!
-	mov	x29, sp
-
-	bl	test_1_byte_direct
-	bl	test_2_byte_direct
-	bl	test_4_byte_direct
-	bl	test_8_byte_direct
-	bl	test_16_byte_direct
-	bl	test_32_byte_direct
-	bl	test_37_byte_arbitrary
-	bl	test_0_byte
-	bl	test_128_byte_too_large
-	bl	test_4_byte_add_immediate
-	bl	test_register_move_negative
-	bl	test_live_in_negative
-	bl	test_register_size_negative
-	bl	test_memcpy8_4_byte
-
-	add     x0, sp, #32
-	add     x1, sp, #96
-	mov     x2, #10
-	mov     x3, #20
-	mov     x4, #0xFF
-	bl      complex_operation
-
-	add     x0, sp, #160
-	add     x1, sp, #96
-	bl      complex_fp_operation
-
-	mov	w0, #0
-	ldp	x29, x30, [sp], #208
-	ret
-	.size	main, .-main
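
Why this negative test needs no code changes: "mov x2, #-1" cannot be encoded
as a MOVZ and assembles to the MOVN alias instead, so extractMoveImmediate
(which matches only MOVZXi/MOVZWi) yields std::nullopt and the original call
is preserved.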

>From 198744d11278c5ec1134252cdccd8bc77ee3380d Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 4 Sep 2025 10:04:51 -0700
Subject: [PATCH 23/26] [nfc] minor refactor

---
 bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 707856b5874ea..9e1cec4c14a93 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2598,10 +2598,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     return 4;
   }
 
-  InstructionListType createInlineMemcpy(bool ReturnEnd) const override {
-    return createInlineMemcpy(ReturnEnd, std::nullopt);
-  }
-
   std::optional<uint64_t>
   extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
     // Match MOVZ instructions (both X and W register variants) with no shift.
@@ -2616,8 +2612,11 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
   std::optional<uint64_t>
   findMemcpySizeInBytes(const BinaryBasicBlock &BB,
                         BinaryBasicBlock::iterator CallInst) const override {
-    BitVector WrittenRegs(RegInfo->getNumRegs());
     MCPhysReg SizeReg = getIntArgRegister(2);
+    if (SizeReg == getNoRegister())
+      return std::nullopt;
+
+    BitVector WrittenRegs(RegInfo->getNumRegs());
     const BitVector &SizeRegAliases = getAliases(SizeReg);
 
     for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) {
@@ -2625,9 +2624,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
       WrittenRegs.reset();
       getWrittenRegs(Inst, WrittenRegs);
 
-      if (SizeReg != getNoRegister() && WrittenRegs.anyCommon(SizeRegAliases);
-          auto ExtractedSize = extractMoveImmediate(Inst, SizeReg))
-        return *ExtractedSize;
+      if (WrittenRegs.anyCommon(SizeRegAliases))
+        return extractMoveImmediate(Inst, SizeReg);
     }
     return std::nullopt;
   }
@@ -2635,6 +2633,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
   InstructionListType
   createInlineMemcpy(bool ReturnEnd,
                      std::optional<uint64_t> KnownSize) const override {
+    assert(KnownSize.has_value() &&
+           "AArch64 memcpy inlining requires known size");
     InstructionListType Code;
     uint64_t Size = *KnownSize;
 

>From 62b871ec4204cd629e2a59e6f07f291c009c0f0a Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Thu, 4 Sep 2025 10:16:34 -0700
Subject: [PATCH 24/26] [bug] fix memcpy call removal for sizes > 64

---
 bolt/lib/Passes/BinaryPasses.cpp                 | 2 +-
 bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 3 +--
 bolt/test/runtime/AArch64/inline-memcpy.s        | 9 ++++-----
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index d40f5fb78c7f3..2f1bb21bc1fd8 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1871,7 +1871,7 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
         std::optional<uint64_t> KnownSize =
             BC.MIB->findMemcpySizeInBytes(BB, II);
 
-        if (BC.isAArch64() && !KnownSize.has_value())
+        if (BC.isAArch64() && (!KnownSize.has_value() || *KnownSize > 64))
           continue;
 
         const InstructionListType NewCode =
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 9e1cec4c14a93..bcc9809b52fab 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2706,8 +2706,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
             Remaining -= OpSize;
             Offset += OpSize;
           }
-      } else
-        Code.clear();
+      }
       break;
     }
     return Code;
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index ee934bc50dbd5..e0072f38db2d2 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -7,8 +7,8 @@
 # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE
 # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM
 
-# Verify BOLT reports that it inlined memcpy calls (12 successful inlines out of 16 total calls)
-# CHECK-INLINE: BOLT-INFO: inlined 12 memcpy() calls
+# Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 16 total calls)
+# CHECK-INLINE: BOLT-INFO: inlined 11 memcpy() calls
 
 # Each function should use optimal size-specific instructions and NO memcpy calls
 
@@ -68,10 +68,9 @@
 # CHECK-ASM-NOT: str
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
-# 128-byte copy should be "inlined" by removing the call entirely (too large for real inlining)
+# 128-byte copy should NOT be inlined (too large, original call preserved)
 # CHECK-ASM-LABEL: <test_128_byte_too_large>:
-# CHECK-ASM-NOT: bl{{.*}}<memcpy
-# CHECK-ASM-NOT: ldr{{.*}}q{{[0-9]+}}
+# CHECK-ASM: bl{{.*}}<memcpy
 
 # ADD immediate with non-zero source should NOT be inlined (can't track mov+add chain)
 # CHECK-ASM-LABEL: <test_4_byte_add_immediate>:

>From dcab6acd61085456c885d0d8f76d99138829d25e Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 5 Sep 2025 09:16:48 -0700
Subject: [PATCH 25/26] [nfc][test] reordered test

---
 bolt/test/runtime/AArch64/inline-memcpy.s | 36 +++++++++++------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index e0072f38db2d2..dc59a08b889a7 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -68,6 +68,10 @@
 # CHECK-ASM-NOT: str
 # CHECK-ASM-NOT: bl{{.*}}<memcpy
 
+# Negative size should NOT be inlined (invalid size parameter)
+# CHECK-ASM-LABEL: <test_negative_size>:
+# CHECK-ASM: bl{{.*}}<memcpy
+
 # 128-byte copy should NOT be inlined (too large, original call preserved)
 # CHECK-ASM-LABEL: <test_128_byte_too_large>:
 # CHECK-ASM: bl{{.*}}<memcpy
@@ -84,10 +88,6 @@
 # CHECK-ASM-LABEL: <test_live_in_negative>:
 # CHECK-ASM: bl{{.*}}<memcpy
 
-# Negative size should NOT be inlined (invalid size parameter)
-# CHECK-ASM-LABEL: <test_negative_size>:
-# CHECK-ASM: bl{{.*}}<memcpy
-
 # _memcpy8 should be inlined with end-pointer return (dest+size)
 # CHECK-ASM-LABEL: <test_memcpy8_4_byte>:
 # CHECK-ASM: ldr{{.*}}w9, [x1]
@@ -218,6 +218,20 @@ test_0_byte:
 	ret
 	.size	test_0_byte, .-test_0_byte
 
+	.globl	test_negative_size
+	.type	test_negative_size,@function
+test_negative_size:
+	# Negative size should not be inlined
+	stp	x29, x30, [sp, #-32]!
+	mov	x29, sp
+	add	x1, sp, #16
+	add	x0, sp, #8
+	mov	x2, #-1
+	bl	memcpy
+	ldp	x29, x30, [sp], #32
+	ret
+	.size	test_negative_size, .-test_negative_size
+
 	.globl	test_128_byte_too_large
 	.type	test_128_byte_too_large, at function
 test_128_byte_too_large:
@@ -273,20 +287,6 @@ test_live_in_negative:
 	ret
 	.size	test_live_in_negative, .-test_live_in_negative
 
-	.globl	test_negative_size
-	.type	test_negative_size,@function
-test_negative_size:
-	# Negative size should not be inlined
-	stp	x29, x30, [sp, #-32]!
-	mov	x29, sp
-	add	x1, sp, #16
-	add	x0, sp, #8
-	mov	x2, #-1
-	bl	memcpy
-	ldp	x29, x30, [sp], #32
-	ret
-	.size	test_negative_size, .-test_negative_size
-
 	.globl	test_memcpy8_4_byte
 	.type	test_memcpy8_4_byte,@function
 test_memcpy8_4_byte:

>From 875156e6bf82cb3e9ba27df0bf541374350ff69e Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at nvidia.com>
Date: Fri, 5 Sep 2025 09:18:20 -0700
Subject: [PATCH 26/26] [nfc] added assert for default case (future-proofing
 for changes to BinaryPasses.cpp)

---
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 42 ++++++++++---------
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index bcc9809b52fab..eb402a5681c53 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2687,26 +2687,28 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
       break;
 
     default:
-      if (Size <= 64) {
-        // For sizes up to 64 bytes, greedily use the largest possible loads.
-        uint64_t Remaining = Size;
-        uint64_t Offset = 0;
-
-        const std::array<std::tuple<uint64_t, unsigned, unsigned, unsigned>, 5>
-            LoadStoreOps = {
-                {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16},
-                 {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9},
-                 {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9},
-                 {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9},
-                 {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}}};
-
-        for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
-          while (Remaining >= OpSize) {
-            AddLoadStorePair(LoadOp, StoreOp, TempReg, Offset / OpSize);
-            Remaining -= OpSize;
-            Offset += OpSize;
-          }
-      }
+      // For sizes up to 64 bytes, greedily use the largest possible loads.
+      // Caller should have already filtered out sizes > 64 bytes.
+      assert(Size <= 64 &&
+             "Size should be <= 64 bytes for AArch64 memcpy inlining");
+
+      uint64_t Remaining = Size;
+      uint64_t Offset = 0;
+
+      const std::array<std::tuple<uint64_t, unsigned, unsigned, unsigned>, 5>
+          LoadStoreOps = {
+              {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16},
+               {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9},
+               {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9},
+               {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9},
+               {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}}};
+
+      for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
+        while (Remaining >= OpSize) {
+          AddLoadStorePair(LoadOp, StoreOp, TempReg, Offset / OpSize);
+          Remaining -= OpSize;
+          Offset += OpSize;
+        }
       break;
     }
     return Code;
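
One caveat on the new assert: assert() compiles away in NDEBUG (release)
builds, so the size filter in BinaryPasses.cpp remains the operative guard;
the assertion documents the <= 64 contract and turns a silent miscompile into
a loud failure in assertion-enabled builds if the pass-side check ever
drifts.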


