[llvm] [AArch64] Optimize memcpy for non-power of two sizes (PR #168890)

Cheng Lingfei via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 21 08:19:18 PST 2026


https://github.com/clingfei updated https://github.com/llvm/llvm-project/pull/168890

>From 2857b8e600b7a509ab96afb291222b1e22f5fcd4 Mon Sep 17 00:00:00 2001
From: clingfei <1599101385 at qq.com>
Date: Thu, 20 Nov 2025 23:18:08 +0800
Subject: [PATCH 1/4] [AArch64] Optimize memcpy for non-power of two sizes

The previous getMemcpyLoadsAndStores implementation would chain load/store
instructions from "NumLdStInMemcpy - GlueIter - GluedLdStLimit" to
"NumLdStInMemcpy - GlueIter". This approach caused issues when copying
non-power-of-two sizes, as it would chain leading load/stores with subsequent
instructions at non-power-of-two aligned offsets.

This chaining pattern prevented the aarch64-ldst-opt pass from optimizing
these load/store instructions effectively.

This commit modifies the chaining range to be from GlueIter to GlueIter +
GluedLdStLimit, enabling proper optimization of load/store instructions
in aarch64-ldst-opt.
---
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  10 +-
 llvm/test/CodeGen/AArch64/aarch64-mops.ll     | 126 +++++++++---------
 2 files changed, 67 insertions(+), 69 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 1b15a207a2d37..20e2b744acc31 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8746,8 +8746,8 @@ static SDValue getMemcpyLoadsAndStores(
         unsigned GlueIter = 0;
 
         for (unsigned cnt = 0; cnt < NumberLdChain; ++cnt) {
-          unsigned IndexFrom = NumLdStInMemcpy - GlueIter - GluedLdStLimit;
-          unsigned IndexTo   = NumLdStInMemcpy - GlueIter;
+          unsigned IndexFrom = GlueIter;
+          unsigned IndexTo = GlueIter + GluedLdStLimit;
 
           chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, IndexFrom, IndexTo,
                                        OutLoadChains, OutStoreChains);
@@ -8756,9 +8756,9 @@ static SDValue getMemcpyLoadsAndStores(
 
         // Residual ld/st.
         if (RemainingLdStInMemcpy) {
-          chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
-                                        RemainingLdStInMemcpy, OutLoadChains,
-                                        OutStoreChains);
+          chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, GlueIter,
+                                       NumLdStInMemcpy, OutLoadChains,
+                                       OutStoreChains);
         }
       }
     }
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mops.ll b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
index 1710fad9f2539..fc64ce7d26d0e 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mops.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
@@ -1407,30 +1407,28 @@ define void @memcpy_inline_300(ptr %dst, ptr %src, i32 %value) {
 ;
 ; SDAG-WITHOUT-MOPS-O2-LABEL: memcpy_inline_300:
 ; SDAG-WITHOUT-MOPS-O2:       // %bb.0: // %entry
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q1, q0, [x1, #16]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    add x8, x1, #284
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q2, [x1]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q1, q0, [x0, #16]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q2, [x0]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q1, q0, [x1, #80]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q2, q3, [x1, #48]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q1, q0, [x0, #80]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q2, q3, [x0, #48]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q1, q0, [x1, #144]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q2, q3, [x1, #112]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q1, q0, [x0, #144]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q2, q3, [x0, #112]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q1, q0, [x1, #208]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q2, q3, [x1, #176]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q1, q0, [x0, #208]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q2, q3, [x0, #176]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q3, q1, [x1, #256]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q0, [x8]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q2, [x1, #240]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    add x8, x0, #284
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q0, [x8]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q3, q1, [x0, #256]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q2, [x0, #240]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q1, q0, [x1, #224]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    add	x8, x1, #284
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q2, q3, [x1, #192]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q1, q0, [x0, #224]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q2, q3, [x0, #192]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q1, q0, [x1, #160]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q2, q3, [x1, #128]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q1, q0, [x0, #160]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q2, q3, [x0, #128]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q1, q0, [x1, #96]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q2, q3, [x1, #64]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q1, q0, [x0, #96]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q2, q3, [x0, #64]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q1, q0, [x1, #32]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q2, q3, [x1]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q1, q0, [x0, #32]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q2, q3, [x0]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q2, q1, [x1, #256]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q0, [x8]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    add	x8, x0, #284
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q0, [x8]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q2, q1, [x0, #256]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ret
 ;
 ; SDAG-MOPS-O2-LABEL: memcpy_inline_300:
@@ -1536,46 +1534,46 @@ define void @memcpy_inline_300_volatile(ptr %dst, ptr %src, i32 %value) {
 ;
 ; SDAG-WITHOUT-MOPS-O2-LABEL: memcpy_inline_300_volatile:
 ; SDAG-WITHOUT-MOPS-O2:       // %bb.0: // %entry
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q0, [x1]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q1, [x1, #16]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q2, [x1, #32]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q3, [x1, #48]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q3, [x0, #48]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q2, [x0, #32]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q1, [x0, #16]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q0, [x0]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q0, [x1, #64]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q1, [x1, #80]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q2, [x1, #96]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q3, [x1, #112]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q3, [x0, #112]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q2, [x0, #96]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q1, [x0, #80]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q0, [x0, #64]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q0, [x1, #128]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q1, [x1, #144]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q2, [x1, #160]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q3, [x1, #176]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q3, [x0, #176]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q2, [x0, #160]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q1, [x0, #144]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q0, [x0, #128]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q0, [x1, #192]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q1, [x1, #208]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q2, [x1, #224]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q3, [x1, #240]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q3, [x0, #240]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q2, [x0, #224]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q1, [x0, #208]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q0, [x0, #192]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q0, [x1, #256]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q1, [x1, #272]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr x8, [x1, #288]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr w9, [x1, #296]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str w9, [x0, #296]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str x8, [x0, #288]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q1, [x0, #272]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q0, [x0, #256]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q0, [x1, #256]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q1, [x1, #272]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	x8, [x1, #288]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	w9, [x1, #296]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	w9, [x0, #296]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	x8, [x0, #288]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q1, [x0, #272]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q0, [x0, #256]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q0, [x1, #192]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q1, [x1, #208]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q2, [x1, #224]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q3, [x1, #240]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q3, [x0, #240]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q2, [x0, #224]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q1, [x0, #208]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q0, [x0, #192]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q0, [x1, #128]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q1, [x1, #144]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q2, [x1, #160]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q3, [x1, #176]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q3, [x0, #176]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q2, [x0, #160]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q1, [x0, #144]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q0, [x0, #128]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q0, [x1, #64]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q1, [x1, #80]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q2, [x1, #96]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q3, [x1, #112]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q3, [x0, #112]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q2, [x0, #96]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q1, [x0, #80]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q0, [x0, #64]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q0, [x1]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q1, [x1, #16]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q2, [x1, #32]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q3, [x1, #48]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q3, [x0, #48]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q2, [x0, #32]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q1, [x0, #16]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q0, [x0]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ret
 ;
 ; SDAG-MOPS-O2-LABEL: memcpy_inline_300_volatile:

>From 44814f8f09c56bfb7b1b1f86eca90d63ae26a263 Mon Sep 17 00:00:00 2001
From: clingfei <1599101385 at qq.com>
Date: Sat, 22 Nov 2025 10:04:23 +0800
Subject: [PATCH 2/4] update test cases

---
 llvm/test/CodeGen/AArch64/aarch64-mops.ll | 223 ++++++++++++++++++++++
 1 file changed, 223 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/aarch64-mops.ll b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
index fc64ce7d26d0e..ea33e98ec6447 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mops.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
@@ -1588,6 +1588,229 @@ entry:
   ret void
 }
 
+define void @memcpy_inline_65(ptr %dst, ptr %src, i32 %value) {
+; GISel-WITHOUT-MOPS-O0-LABEL: memcpy_inline_65:
+; GISel-WITHOUT-MOPS-O0:       // %bb.0: // %entry
+; GISel-WITHOUT-MOPS-O0-NEXT:    ldr	q0, [x1]
+; GISel-WITHOUT-MOPS-O0-NEXT:    str	q0, [x0]
+; GISel-WITHOUT-MOPS-O0-NEXT:    ldr	q0, [x1, #16]
+; GISel-WITHOUT-MOPS-O0-NEXT:    str	q0, [x0, #16]
+; GISel-WITHOUT-MOPS-O0-NEXT:    ldr	q0, [x1, #32]
+; GISel-WITHOUT-MOPS-O0-NEXT:    str	q0, [x0, #32]
+; GISel-WITHOUT-MOPS-O0-NEXT:    ldr	q0, [x1, #48]
+; GISel-WITHOUT-MOPS-O0-NEXT:    str	q0, [x0, #48]
+; GISel-WITHOUT-MOPS-O0-NEXT:    ldrb	w8, [x1, #64]
+; GISel-WITHOUT-MOPS-O0-NEXT:    strb	w8, [x0, #64]
+; GISel-WITHOUT-MOPS-O0-NEXT:    ret
+;
+; GISel-WITHOUT-MOPS-O3-LABEL: memcpy_inline_65:
+; GISel-WITHOUT-MOPS-O3:       // %bb.0: // %entry
+; GISel-WITHOUT-MOPS-O3-NEXT:    ldr	q0, [x1]
+; GISel-WITHOUT-MOPS-O3-NEXT:    str	q0, [x0]
+; GISel-WITHOUT-MOPS-O3-NEXT:    ldr	q0, [x1, #16]
+; GISel-WITHOUT-MOPS-O3-NEXT:    str	q0, [x0, #16]
+; GISel-WITHOUT-MOPS-O3-NEXT:    ldr	q0, [x1, #32]
+; GISel-WITHOUT-MOPS-O3-NEXT:    str	q0, [x0, #32]
+; GISel-WITHOUT-MOPS-O3-NEXT:    ldr	q0, [x1, #48]
+; GISel-WITHOUT-MOPS-O3-NEXT:    str	q0, [x0, #48]
+; GISel-WITHOUT-MOPS-O3-NEXT:    ldrb	w8, [x1, #64]
+; GISel-WITHOUT-MOPS-O3-NEXT:    strb	w8, [x0, #64]
+; GISel-WITHOUT-MOPS-O3-NEXT:    ret
+;
+; GISel-MOPS-O0-LABEL: memcpy_inline_65:
+; GISel-MOPS-O0:       // %bb.0: // %entry
+; GISel-MOPS-O0-NEXT:    ldr	q0, [x1]
+; GISel-MOPS-O0-NEXT:    str	q0, [x0]
+; GISel-MOPS-O0-NEXT:    ldr	q0, [x1, #16]
+; GISel-MOPS-O0-NEXT:    str	q0, [x0, #16]
+; GISel-MOPS-O0-NEXT:    ldr	q0, [x1, #32]
+; GISel-MOPS-O0-NEXT:    str	q0, [x0, #32]
+; GISel-MOPS-O0-NEXT:    ldr	q0, [x1, #48]
+; GISel-MOPS-O0-NEXT:    str	q0, [x0, #48]
+; GISel-MOPS-O0-NEXT:    ldrb	w8, [x1, #64]
+; GISel-MOPS-O0-NEXT:    strb	w8, [x0, #64]
+; GISel-MOPS-O0-NEXT:    ret
+;
+; GISel-MOPS-O3-LABEL: memcpy_inline_65:
+; GISel-MOPS-O3:       // %bb.0: // %entry
+; GISel-MOPS-O3-NEXT:    ldr	q0, [x1]
+; GISel-MOPS-O3-NEXT:    str	q0, [x0]
+; GISel-MOPS-O3-NEXT:    ldr	q0, [x1, #16]
+; GISel-MOPS-O3-NEXT:    str	q0, [x0, #16]
+; GISel-MOPS-O3-NEXT:    ldr	q0, [x1, #32]
+; GISel-MOPS-O3-NEXT:    str	q0, [x0, #32]
+; GISel-MOPS-O3-NEXT:    ldr	q0, [x1, #48]
+; GISel-MOPS-O3-NEXT:    str	q0, [x0, #48]
+; GISel-MOPS-O3-NEXT:    ldrb	w8, [x1, #64]
+; GISel-MOPS-O3-NEXT:    strb	w8, [x0, #64]
+; GISel-MOPS-O3-NEXT:    ret
+;
+; SDAG-WITHOUT-MOPS-O2-LABEL: memcpy_inline_65:
+; SDAG-WITHOUT-MOPS-O2:       // %bb.0: // %entry
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldrb	w8, [x1, #64]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    strb	w8, [x0, #64]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q1, q0, [x1, #32]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q2, q3, [x1]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q1, q0, [x0, #32]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q2, q3, [x0]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ret
+;
+; SDAG-MOPS-O2-LABEL: memcpy_inline_65:
+; SDAG-MOPS-O2:       // %bb.0: // %entry
+; SDAG-MOPS-O2-NEXT:    ldrb	w8, [x1, #64]
+; SDAG-MOPS-O2-NEXT:    strb	w8, [x0, #64]
+; SDAG-MOPS-O2-NEXT:    ldp	q1, q0, [x1, #32]
+; SDAG-MOPS-O2-NEXT:    ldp	q2, q3, [x1]
+; SDAG-MOPS-O2-NEXT:    stp	q1, q0, [x0, #32]
+; SDAG-MOPS-O2-NEXT:    stp	q2, q3, [x0]
+; SDAG-MOPS-O2-NEXT:    ret
+entry:
+  call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 %dst, ptr align 1 %src, i64 65, i1 false)
+  ret void
+}
+
+define void @memcpy_inline_64(ptr %dst, ptr %src, i32 %value) {
+; GISel-WITHOUT-MOPS-O0-LABEL: memcpy_inline_64:
+; GISel-WITHOUT-MOPS-O0:       // %bb.0: // %entry
+; GISel-WITHOUT-MOPS-O0-NEXT:    ldr	q0, [x1]
+; GISel-WITHOUT-MOPS-O0-NEXT:    str	q0, [x0]
+; GISel-WITHOUT-MOPS-O0-NEXT:    ldr	q0, [x1, #16]
+; GISel-WITHOUT-MOPS-O0-NEXT:    str	q0, [x0, #16]
+; GISel-WITHOUT-MOPS-O0-NEXT:    ldr	q0, [x1, #32]
+; GISel-WITHOUT-MOPS-O0-NEXT:    str	q0, [x0, #32]
+; GISel-WITHOUT-MOPS-O0-NEXT:    ldr	q0, [x1, #48]
+; GISel-WITHOUT-MOPS-O0-NEXT:    str	q0, [x0, #48]
+; GISel-WITHOUT-MOPS-O0-NEXT:    ret
+;
+; GISel-WITHOUT-MOPS-O3-LABEL: memcpy_inline_64:
+; GISel-WITHOUT-MOPS-O3:       // %bb.0: // %entry
+; GISel-WITHOUT-MOPS-O3-NEXT:    ldr	q0, [x1]
+; GISel-WITHOUT-MOPS-O3-NEXT:    str	q0, [x0]
+; GISel-WITHOUT-MOPS-O3-NEXT:    ldr	q0, [x1, #16]
+; GISel-WITHOUT-MOPS-O3-NEXT:    str	q0, [x0, #16]
+; GISel-WITHOUT-MOPS-O3-NEXT:    ldr	q0, [x1, #32]
+; GISel-WITHOUT-MOPS-O3-NEXT:    str	q0, [x0, #32]
+; GISel-WITHOUT-MOPS-O3-NEXT:    ldr	q0, [x1, #48]
+; GISel-WITHOUT-MOPS-O3-NEXT:    str	q0, [x0, #48]
+; GISel-WITHOUT-MOPS-O3-NEXT:    ret
+;
+; GISel-MOPS-O0-LABEL: memcpy_inline_64:
+; GISel-MOPS-O0:       // %bb.0: // %entry
+; GISel-MOPS-O0-NEXT:    ldr	q0, [x1]
+; GISel-MOPS-O0-NEXT:    str	q0, [x0]
+; GISel-MOPS-O0-NEXT:    ldr	q0, [x1, #16]
+; GISel-MOPS-O0-NEXT:    str	q0, [x0, #16]
+; GISel-MOPS-O0-NEXT:    ldr	q0, [x1, #32]
+; GISel-MOPS-O0-NEXT:    str	q0, [x0, #32]
+; GISel-MOPS-O0-NEXT:    ldr	q0, [x1, #48]
+; GISel-MOPS-O0-NEXT:    str	q0, [x0, #48]
+; GISel-MOPS-O0-NEXT:    ret
+;
+; GISel-MOPS-O3-LABEL: memcpy_inline_64:
+; GISel-MOPS-O3:       // %bb.0: // %entry
+; GISel-MOPS-O3-NEXT:    ldr	q0, [x1]
+; GISel-MOPS-O3-NEXT:    str	q0, [x0]
+; GISel-MOPS-O3-NEXT:    ldr	q0, [x1, #16]
+; GISel-MOPS-O3-NEXT:    str	q0, [x0, #16]
+; GISel-MOPS-O3-NEXT:    ldr	q0, [x1, #32]
+; GISel-MOPS-O3-NEXT:    str	q0, [x0, #32]
+; GISel-MOPS-O3-NEXT:    ldr	q0, [x1, #48]
+; GISel-MOPS-O3-NEXT:    str	q0, [x0, #48]
+; GISel-MOPS-O3-NEXT:    ret
+;
+; SDAG-WITHOUT-MOPS-O2-LABEL: memcpy_inline_64:
+; SDAG-WITHOUT-MOPS-O2:       // %bb.0: // %entry
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q1, q0, [x1, #32]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q2, q3, [x1]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q1, q0, [x0, #32]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q2, q3, [x0]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ret
+;
+; SDAG-MOPS-O2-LABEL: memcpy_inline_64:
+; SDAG-MOPS-O2:       // %bb.0: // %entry
+; SDAG-MOPS-O2-NEXT:    ldp	q1, q0, [x1, #32]
+; SDAG-MOPS-O2-NEXT:    ldp	q2, q3, [x1]
+; SDAG-MOPS-O2-NEXT:    stp	q1, q0, [x0, #32]
+; SDAG-MOPS-O2-NEXT:    stp	q2, q3, [x0]
+; SDAG-MOPS-O2-NEXT:    ret
+entry:
+  call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 %dst, ptr align 1 %src, i64 64, i1 false)
+  ret void
+}
+
+define void @memcpy_inline_63(ptr %dst, ptr %src, i32 %value) {
+; GISel-WITHOUT-MOPS-O0-LABEL: memcpy_inline_63:
+; GISel-WITHOUT-MOPS-O0:       // %bb.0: // %entry
+; GISel-WITHOUT-MOPS-O0-NEXT:    ldr	q0, [x1]
+; GISel-WITHOUT-MOPS-O0-NEXT:    str	q0, [x0]
+; GISel-WITHOUT-MOPS-O0-NEXT:    ldr	q0, [x1, #16]
+; GISel-WITHOUT-MOPS-O0-NEXT:    str	q0, [x0, #16]
+; GISel-WITHOUT-MOPS-O0-NEXT:    ldr	q0, [x1, #32]
+; GISel-WITHOUT-MOPS-O0-NEXT:    str	q0, [x0, #32]
+; GISel-WITHOUT-MOPS-O0-NEXT:    ldur	q0, [x1, #47]
+; GISel-WITHOUT-MOPS-O0-NEXT:    stur	q0, [x0, #47]
+; GISel-WITHOUT-MOPS-O0-NEXT:    ret
+;
+; GISel-WITHOUT-MOPS-O3-LABEL: memcpy_inline_63:
+; GISel-WITHOUT-MOPS-O3:       // %bb.0: // %entry
+; GISel-WITHOUT-MOPS-O3-NEXT:    ldr	q0, [x1]
+; GISel-WITHOUT-MOPS-O3-NEXT:    str	q0, [x0]
+; GISel-WITHOUT-MOPS-O3-NEXT:    ldr	q0, [x1, #16]
+; GISel-WITHOUT-MOPS-O3-NEXT:    str	q0, [x0, #16]
+; GISel-WITHOUT-MOPS-O3-NEXT:    ldr	q0, [x1, #32]
+; GISel-WITHOUT-MOPS-O3-NEXT:    str	q0, [x0, #32]
+; GISel-WITHOUT-MOPS-O3-NEXT:    ldur	q0, [x1, #47]
+; GISel-WITHOUT-MOPS-O3-NEXT:    stur	q0, [x0, #47]
+; GISel-WITHOUT-MOPS-O3-NEXT:    ret
+;
+; GISel-MOPS-O0-LABEL: memcpy_inline_63:
+; GISel-MOPS-O0:       // %bb.0: // %entry
+; GISel-MOPS-O0-NEXT:    ldr	q0, [x1]
+; GISel-MOPS-O0-NEXT:    str	q0, [x0]
+; GISel-MOPS-O0-NEXT:    ldr	q0, [x1, #16]
+; GISel-MOPS-O0-NEXT:    str	q0, [x0, #16]
+; GISel-MOPS-O0-NEXT:    ldr	q0, [x1, #32]
+; GISel-MOPS-O0-NEXT:    str	q0, [x0, #32]
+; GISel-MOPS-O0-NEXT:    ldur	q0, [x1, #47]
+; GISel-MOPS-O0-NEXT:    stur	q0, [x0, #47]
+; GISel-MOPS-O0-NEXT:    ret
+;
+; GISel-MOPS-O3-LABEL: memcpy_inline_63:
+; GISel-MOPS-O3:       // %bb.0: // %entry
+; GISel-MOPS-O3-NEXT:    ldr	q0, [x1]
+; GISel-MOPS-O3-NEXT:    str	q0, [x0]
+; GISel-MOPS-O3-NEXT:    ldr	q0, [x1, #16]
+; GISel-MOPS-O3-NEXT:    str	q0, [x0, #16]
+; GISel-MOPS-O3-NEXT:    ldr	q0, [x1, #32]
+; GISel-MOPS-O3-NEXT:    str	q0, [x0, #32]
+; GISel-MOPS-O3-NEXT:    ldur	q0, [x1, #47]
+; GISel-MOPS-O3-NEXT:    stur	q0, [x0, #47]
+; GISel-MOPS-O3-NEXT:    ret
+;
+; SDAG-WITHOUT-MOPS-O2-LABEL: memcpy_inline_63:
+; SDAG-WITHOUT-MOPS-O2:       // %bb.0: // %entry
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q3, q1, [x1, #16]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldur	q0, [x1, #47]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q2, [x1]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stur	q0, [x0, #47]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q3, q1, [x0, #16]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q2, [x0]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ret
+;
+; SDAG-MOPS-O2-LABEL: memcpy_inline_63:
+; SDAG-MOPS-O2:       // %bb.0: // %entry
+; SDAG-MOPS-O2-NEXT:    ldp	q3, q1, [x1, #16]
+; SDAG-MOPS-O2-NEXT:    ldur	q0, [x1, #47]
+; SDAG-MOPS-O2-NEXT:    ldr	q2, [x1]
+; SDAG-MOPS-O2-NEXT:    stur	q0, [x0, #47]
+; SDAG-MOPS-O2-NEXT:    stp	q3, q1, [x0, #16]
+; SDAG-MOPS-O2-NEXT:    str	q2, [x0]
+; SDAG-MOPS-O2-NEXT:    ret
+entry:
+  call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 %dst, ptr align 1 %src, i64 63, i1 false)
+  ret void
+}
+
 define void @memmove_0(ptr %dst, ptr %src, i32 %value) {
 ; GISel-WITHOUT-MOPS-LABEL: memmove_0:
 ; GISel-WITHOUT-MOPS:       // %bb.0: // %entry

>From cf8a9543af7bde5ef41c5e2d7c8e94d0661b4774 Mon Sep 17 00:00:00 2001
From: clingfei <1599101385 at qq.com>
Date: Thu, 22 Jan 2026 00:04:29 +0800
Subject: [PATCH 3/4] reorganize the order of chaining for load and store
 instructions in memcpy

---
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  19 ++-
 llvm/test/CodeGen/AArch64/aarch64-mops.ll     | 112 +++++++++---------
 2 files changed, 65 insertions(+), 66 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 20e2b744acc31..cdd189e42f904 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8745,21 +8745,20 @@ static SDValue getMemcpyLoadsAndStores(
         unsigned RemainingLdStInMemcpy = NumLdStInMemcpy % GluedLdStLimit;
         unsigned GlueIter = 0;
 
-        for (unsigned cnt = 0; cnt < NumberLdChain; ++cnt) {
-          unsigned IndexFrom = GlueIter;
-          unsigned IndexTo = GlueIter + GluedLdStLimit;
-
-          chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, IndexFrom, IndexTo,
-                                       OutLoadChains, OutStoreChains);
-          GlueIter += GluedLdStLimit;
-        }
-
         // Residual ld/st.
         if (RemainingLdStInMemcpy) {
-          chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, GlueIter,
+          chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, NumLdStInMemcpy - RemainingLdStInMemcpy,
                                        NumLdStInMemcpy, OutLoadChains,
                                        OutStoreChains);
         }
+
+        for (unsigned cnt = 0; cnt < NumberLdChain; ++cnt) {
+          unsigned IndexFrom = NumLdStInMemcpy - RemainingLdStInMemcpy - GlueIter - GluedLdStLimit;
+          unsigned IndexTo = NumLdStInMemcpy - RemainingLdStInMemcpy - GlueIter;
+          chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, IndexFrom, IndexTo,
+                                       OutLoadChains, OutStoreChains);
+          GlueIter += GluedLdStLimit;
+        }
       }
     }
   }
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mops.ll b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
index ea33e98ec6447..fe69718c53e6e 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mops.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
@@ -1407,23 +1407,23 @@ define void @memcpy_inline_300(ptr %dst, ptr %src, i32 %value) {
 ;
 ; SDAG-WITHOUT-MOPS-O2-LABEL: memcpy_inline_300:
 ; SDAG-WITHOUT-MOPS-O2:       // %bb.0: // %entry
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q1, q0, [x1, #224]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q1, q0, [x1, #32]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    add	x8, x1, #284
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q2, q3, [x1, #192]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q1, q0, [x0, #224]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q2, q3, [x0, #192]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q1, q0, [x1, #160]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q2, q3, [x1, #128]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q1, q0, [x0, #160]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q2, q3, [x0, #128]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q2, q3, [x1]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q1, q0, [x0, #32]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q2, q3, [x0]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q1, q0, [x1, #96]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q2, q3, [x1, #64]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q1, q0, [x0, #96]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q2, q3, [x0, #64]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q1, q0, [x1, #32]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q2, q3, [x1]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q1, q0, [x0, #32]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q2, q3, [x0]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q1, q0, [x1, #160]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q2, q3, [x1, #128]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q1, q0, [x0, #160]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q2, q3, [x0, #128]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q1, q0, [x1, #224]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q2, q3, [x1, #192]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q1, q0, [x0, #224]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q2, q3, [x0, #192]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q2, q1, [x1, #256]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q0, [x8]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    add	x8, x0, #284
@@ -1534,46 +1534,46 @@ define void @memcpy_inline_300_volatile(ptr %dst, ptr %src, i32 %value) {
 ;
 ; SDAG-WITHOUT-MOPS-O2-LABEL: memcpy_inline_300_volatile:
 ; SDAG-WITHOUT-MOPS-O2:       // %bb.0: // %entry
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q0, [x1, #256]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q1, [x1, #272]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	x8, [x1, #288]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	w9, [x1, #296]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	w9, [x0, #296]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	x8, [x0, #288]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q1, [x0, #272]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q0, [x0, #256]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q0, [x1, #192]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q1, [x1, #208]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q2, [x1, #224]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q3, [x1, #240]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q3, [x0, #240]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q2, [x0, #224]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q1, [x0, #208]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q0, [x0, #192]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q0, [x1, #128]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q1, [x1, #144]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q2, [x1, #160]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q3, [x1, #176]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q3, [x0, #176]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q2, [x0, #160]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q1, [x0, #144]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q0, [x0, #128]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q0, [x1, #64]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q1, [x1, #80]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q2, [x1, #96]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q3, [x1, #112]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q3, [x0, #112]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q2, [x0, #96]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q1, [x0, #80]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q0, [x0, #64]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q0, [x1]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q1, [x1, #16]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q2, [x1, #32]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr	q3, [x1, #48]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q3, [x0, #48]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q2, [x0, #32]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q1, [x0, #16]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str	q0, [x0]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q0, [x1]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q1, [x1, #16]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q2, [x1, #32]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q3, [x1, #48]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q3, [x0, #48]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q2, [x0, #32]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q1, [x0, #16]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q0, [x0]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q0, [x1, #64]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q1, [x1, #80]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q2, [x1, #96]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q3, [x1, #112]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q3, [x0, #112]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q2, [x0, #96]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q1, [x0, #80]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q0, [x0, #64]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q0, [x1, #128]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q1, [x1, #144]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q2, [x1, #160]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q3, [x1, #176]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q3, [x0, #176]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q2, [x0, #160]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q1, [x0, #144]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q0, [x0, #128]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q0, [x1, #192]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q1, [x1, #208]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q2, [x1, #224]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q3, [x1, #240]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q3, [x0, #240]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q2, [x0, #224]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q1, [x0, #208]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q0, [x0, #192]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q0, [x1, #256]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q1, [x1, #272]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr x8, [x1, #288]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr w9, [x1, #296]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str w9, [x0, #296]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str x8, [x0, #288]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q1, [x0, #272]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q0, [x0, #256]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ret
 ;
 ; SDAG-MOPS-O2-LABEL: memcpy_inline_300_volatile:
@@ -1647,22 +1647,22 @@ define void @memcpy_inline_65(ptr %dst, ptr %src, i32 %value) {
 ;
 ; SDAG-WITHOUT-MOPS-O2-LABEL: memcpy_inline_65:
 ; SDAG-WITHOUT-MOPS-O2:       // %bb.0: // %entry
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldrb	w8, [x1, #64]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    strb	w8, [x0, #64]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q1, q0, [x1, #32]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp	q2, q3, [x1]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q1, q0, [x0, #32]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    stp	q2, q3, [x0]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldrb	w8, [x1, #64]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    strb	w8, [x0, #64]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ret
 ;
 ; SDAG-MOPS-O2-LABEL: memcpy_inline_65:
 ; SDAG-MOPS-O2:       // %bb.0: // %entry
-; SDAG-MOPS-O2-NEXT:    ldrb	w8, [x1, #64]
-; SDAG-MOPS-O2-NEXT:    strb	w8, [x0, #64]
 ; SDAG-MOPS-O2-NEXT:    ldp	q1, q0, [x1, #32]
 ; SDAG-MOPS-O2-NEXT:    ldp	q2, q3, [x1]
 ; SDAG-MOPS-O2-NEXT:    stp	q1, q0, [x0, #32]
 ; SDAG-MOPS-O2-NEXT:    stp	q2, q3, [x0]
+; SDAG-MOPS-O2-NEXT:    ldrb	w8, [x1, #64]
+; SDAG-MOPS-O2-NEXT:    strb	w8, [x0, #64]
 ; SDAG-MOPS-O2-NEXT:    ret
 entry:
   call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 %dst, ptr align 1 %src, i64 65, i1 false)

>From a612d8a4ddeb5b3f263dc3c4489f28b1d98ff629 Mon Sep 17 00:00:00 2001
From: clingfei <1599101385 at qq.com>
Date: Thu, 22 Jan 2026 00:19:00 +0800
Subject: [PATCH 4/4] code format

---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index cdd189e42f904..960362ea484ba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8747,13 +8747,14 @@ static SDValue getMemcpyLoadsAndStores(
 
         // Residual ld/st.
         if (RemainingLdStInMemcpy) {
-          chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, NumLdStInMemcpy - RemainingLdStInMemcpy,
-                                       NumLdStInMemcpy, OutLoadChains,
-                                       OutStoreChains);
+          chainLoadsAndStoresForMemcpy(
+              DAG, dl, OutChains, NumLdStInMemcpy - RemainingLdStInMemcpy,
+              NumLdStInMemcpy, OutLoadChains, OutStoreChains);
         }
 
         for (unsigned cnt = 0; cnt < NumberLdChain; ++cnt) {
-          unsigned IndexFrom = NumLdStInMemcpy - RemainingLdStInMemcpy - GlueIter - GluedLdStLimit;
+          unsigned IndexFrom = NumLdStInMemcpy - RemainingLdStInMemcpy -
+                               GlueIter - GluedLdStLimit;
           unsigned IndexTo = NumLdStInMemcpy - RemainingLdStInMemcpy - GlueIter;
           chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, IndexFrom, IndexTo,
                                        OutLoadChains, OutStoreChains);



More information about the llvm-commits mailing list