[llvm] AArch64: Optimize memmove for non-power-of-two sizes (PR #168633)
Osama Abdelkader via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 20 09:01:36 PST 2025
https://github.com/osamakader updated https://github.com/llvm/llvm-project/pull/168633
>From b54109cc70596dab091c804ff2ae685a1404560d Mon Sep 17 00:00:00 2001
From: Osama Abdelkader <osama.abdelkader at gmail.com>
Date: Wed, 19 Nov 2025 01:56:10 +0200
Subject: [PATCH 1/2] AArch64: Optimize memmove for non-power-of-two sizes
using overlapping loads/stores
This change improves memmove code generation for non-power-of-two sizes
on AArch64 by using overlapping loads/stores instead of mixed-size
operations, matching GCC's approach.
For example, for a 7-byte memmove:
- Before: ldrb + ldrh + ldr (3 loads, 3 stores)
- After: ldur w8, [x1, #3] + ldr w9, [x1] (2 loads, 2 stores)
The optimization handles the following non-power-of-two sizes from 5 to 65 bytes:
- 5-7 bytes: two overlapping i32 operations
- 9 and 11-15 bytes: two overlapping i64 operations
- 17-23 bytes: two i64 + one overlapping i64
- 25-31 bytes: one v16i8 vector + one overlapping i64
- 33-47 bytes: two v16i8 vectors + one overlapping i64
- 49-63 bytes: three v16i8 vectors + one overlapping i64
- 65 bytes: four v16i8 vectors + one overlapping i64
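As a concrete illustration of the 7-byte case above, the emitted code is roughly
equivalent to the following C++ sketch (illustrative only, not part of the patch;
the function name is made up). Both 4-byte chunks are loaded before either store
is issued, so the copy remains correct when the source and destination overlap:

  #include <cstdint>
  #include <cstring>

  static void move7_overlapping(unsigned char *dst, const unsigned char *src) {
    uint32_t lo, hi;
    std::memcpy(&lo, src, 4);     // bytes 0..3 -> ldr  w9, [x1]
    std::memcpy(&hi, src + 3, 4); // bytes 3..6 -> ldur w8, [x1, #3]
    std::memcpy(dst, &lo, 4);     //               str  w9, [x0]
    std::memcpy(dst + 3, &hi, 4); //               stur w8, [x0, #3]
  }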
This addresses issue #165948 where LLVM generated suboptimal code
compared to GCC for non-power-of-two memmove sizes.
Signed-off-by: Osama Abdelkader <osama.abdelkader at gmail.com>
---
.../Target/AArch64/AArch64ISelLowering.cpp | 46 +++
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 6 +
.../AArch64/AArch64SelectionDAGInfo.cpp | 326 ++++++++++++++++++
llvm/test/CodeGen/AArch64/memmove-inline.ll | 98 ++++++
4 files changed, 476 insertions(+)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8f41f230b5521..2460921050229 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18502,6 +18502,52 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT(
return LLT();
}
+bool AArch64TargetLowering::findOptimalMemOpLowering(
+ LLVMContext &Context, std::vector<EVT> &MemOps, unsigned Limit,
+ const MemOp &Op, unsigned DstAS, unsigned SrcAS,
+ const AttributeList &FuncAttributes) const {
+ if (!Op.isMemset() && !Op.allowOverlap()) {
+ uint64_t Size = Op.size();
+ bool HandledSize = (Size >= 5 && Size <= 7) ||
+ (Size == 9) ||
+ (Size >= 11 && Size <= 15) ||
+ (Size >= 17 && Size <= 23) ||
+ (Size >= 25 && Size <= 31) ||
+ (Size >= 33 && Size <= 47) ||
+ (Size >= 49 && Size <= 63) ||
+ (Size == 65);
+
+ if (HandledSize) {
+ auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
+ if (Op.isAligned(AlignCheck))
+ return true;
+ unsigned Fast;
+ return allowsMisalignedMemoryAccesses(
+ VT, DstAS, Align(1), MachineMemOperand::MONone, &Fast) &&
+ Fast;
+ };
+
+ // Check if we can use the appropriate type for this size range
+ bool CanHandle = false;
+ if (Size >= 5 && Size <= 7) {
+ CanHandle = AlignmentIsAcceptable(MVT::i32, Align(1));
+ } else if (Size >= 9 && Size <= 23) {
+ CanHandle = AlignmentIsAcceptable(MVT::i64, Align(1));
+ } else if (Size >= 25 && Size <= 65) {
+ CanHandle = AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
+ AlignmentIsAcceptable(MVT::i64, Align(1));
+ }
+
+ if (CanHandle)
+ return false;
+ }
+ }
+
+ // Otherwise, use the default implementation
+ return TargetLowering::findOptimalMemOpLowering(Context, MemOps, Limit, Op,
+ DstAS, SrcAS, FuncAttributes);
+}
+
// 12-bit optionally shifted immediates are legal for adds.
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
if (Immed == std::numeric_limits<int64_t>::min()) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index be198e54cbcbf..4748835c47938 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -258,6 +258,12 @@ class AArch64TargetLowering : public TargetLowering {
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
const AttributeList &FuncAttributes) const override;
+ bool
+ findOptimalMemOpLowering(LLVMContext &Context, std::vector<EVT> &MemOps,
+ unsigned Limit, const MemOp &Op, unsigned DstAS,
+ unsigned SrcAS,
+ const AttributeList &FuncAttributes) const override;
+
LLT getOptimalMemOpLLT(const MemOp &Op,
const AttributeList &FuncAttributes) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 48e03ad853d26..60d20672c46cb 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -252,6 +252,332 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
if (LowerToSMERoutines && !Attrs.hasNonStreamingInterfaceAndBody())
return EmitStreamingCompatibleMemLibCall(DAG, dl, Chain, Dst, Src, Size,
RTLIB::MEMMOVE);
+
+ // Handle small memmove cases with overlapping loads/stores for better codegen
+ // For non-power-of-two sizes, use overlapping operations instead of
+ // mixed-size operations (e.g., for 7 bytes: two i32 loads/stores with overlap
+ // instead of i32 + i16 + i8)
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Size)) {
+ uint64_t SizeVal = C->getZExtValue();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
+ if (Alignment >= AlignCheck)
+ return true;
+ unsigned Fast;
+ return TLI.allowsMisalignedMemoryAccesses(
+ VT, DstPtrInfo.getAddrSpace(), Align(1),
+ MachineMemOperand::MONone, &Fast) &&
+ Fast;
+ };
+
+ MachineMemOperand::Flags MMOFlags =
+ isVolatile ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
+
+ // For sizes 5-7 bytes: use two overlapping i32 operations
+ if (SizeVal >= 5 && SizeVal <= 7) {
+ if (AlignmentIsAcceptable(MVT::i32, Align(1))) {
+ uint64_t SecondOffset = SizeVal - 4;
+
+ SDValue Load1 =
+ DAG.getLoad(MVT::i32, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
+ SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Load2 = DAG.getLoad(
+ MVT::i32, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SecondOffset)),
+ SrcPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1),
+ Load2.getValue(1));
+
+ SDValue Store1 =
+ DAG.getStore(Chain, dl, Load1,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
+ DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Store2 = DAG.getStore(
+ Chain, dl, Load2,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(SecondOffset)),
+ DstPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+ }
+ }
+
+ // For sizes 9-15 bytes: use i64 + overlapping i64
+ if (SizeVal >= 9 && SizeVal <= 15) {
+ if (AlignmentIsAcceptable(MVT::i64, Align(1))) {
+ uint64_t SecondOffset = SizeVal - 8;
+
+ SDValue Load1 =
+ DAG.getLoad(MVT::i64, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
+ SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Load2 = DAG.getLoad(
+ MVT::i64, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SecondOffset)),
+ SrcPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1),
+ Load2.getValue(1));
+
+ SDValue Store1 =
+ DAG.getStore(Chain, dl, Load1,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
+ DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Store2 = DAG.getStore(
+ Chain, dl, Load2,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(SecondOffset)),
+ DstPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+ }
+ }
+
+ // For sizes 17-23 bytes: use i64 + i64 + overlapping i64
+ if (SizeVal >= 17 && SizeVal <= 23) {
+ if (AlignmentIsAcceptable(MVT::i64, Align(1))) {
+ uint64_t ThirdOffset = SizeVal - 8;
+
+ SDValue Load1 =
+ DAG.getLoad(MVT::i64, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
+ SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Load2 =
+ DAG.getLoad(MVT::i64, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(8)),
+ SrcPtrInfo.getWithOffset(8), Alignment, MMOFlags);
+
+ SDValue Load3 = DAG.getLoad(
+ MVT::i64, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(ThirdOffset)),
+ SrcPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags);
+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1),
+ Load2.getValue(1), Load3.getValue(1));
+
+ SDValue Store1 =
+ DAG.getStore(Chain, dl, Load1,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
+ DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Store2 =
+ DAG.getStore(Chain, dl, Load2,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(8)),
+ DstPtrInfo.getWithOffset(8), Alignment, MMOFlags);
+
+ SDValue Store3 = DAG.getStore(
+ Chain, dl, Load3,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(ThirdOffset)),
+ DstPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2,
+ Store3);
+ }
+ }
+
+ // For sizes 25-31 bytes: use v16i8 (vector) + overlapping i64
+ if (SizeVal >= 25 && SizeVal <= 31) {
+ if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
+ AlignmentIsAcceptable(MVT::i64, Align(1))) {
+ uint64_t SecondOffset = SizeVal - 8;
+
+ SDValue Load1 =
+ DAG.getLoad(MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
+ SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Load2 = DAG.getLoad(
+ MVT::i64, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SecondOffset)),
+ SrcPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1),
+ Load2.getValue(1));
+
+ SDValue Store1 =
+ DAG.getStore(Chain, dl, Load1,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
+ DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Store2 = DAG.getStore(
+ Chain, dl, Load2,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(SecondOffset)),
+ DstPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+ }
+ }
+
+ // For sizes 33-47 bytes: use 2 x v16i8 (vectors) + overlapping i64
+ if (SizeVal >= 33 && SizeVal <= 47) {
+ if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
+ AlignmentIsAcceptable(MVT::i64, Align(1))) {
+ uint64_t ThirdOffset = SizeVal - 8;
+
+ SDValue Load1 =
+ DAG.getLoad(MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
+ SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Load2 =
+ DAG.getLoad(MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(16)),
+ SrcPtrInfo.getWithOffset(16), Alignment, MMOFlags);
+
+ SDValue Load3 = DAG.getLoad(
+ MVT::i64, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(ThirdOffset)),
+ SrcPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags);
+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1),
+ Load2.getValue(1), Load3.getValue(1));
+
+ SDValue Store1 =
+ DAG.getStore(Chain, dl, Load1,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
+ DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Store2 = DAG.getStore(
+ Chain, dl, Load2,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(16)),
+ DstPtrInfo.getWithOffset(16), Alignment, MMOFlags);
+
+ SDValue Store3 = DAG.getStore(
+ Chain, dl, Load3,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(ThirdOffset)),
+ DstPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2,
+ Store3);
+ }
+ }
+
+ // For sizes 49-63 bytes: use 3 x v16i8 (vectors) + overlapping i64
+ if (SizeVal >= 49 && SizeVal <= 63) {
+ if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
+ AlignmentIsAcceptable(MVT::i64, Align(1))) {
+ uint64_t FourthOffset = SizeVal - 8;
+
+ SDValue Load1 =
+ DAG.getLoad(MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
+ SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Load2 =
+ DAG.getLoad(MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(16)),
+ SrcPtrInfo.getWithOffset(16), Alignment, MMOFlags);
+
+ SDValue Load3 =
+ DAG.getLoad(MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(32)),
+ SrcPtrInfo.getWithOffset(32), Alignment, MMOFlags);
+
+ SDValue Load4 = DAG.getLoad(
+ MVT::i64, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(FourthOffset)),
+ SrcPtrInfo.getWithOffset(FourthOffset), Alignment, MMOFlags);
+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1),
+ Load2.getValue(1), Load3.getValue(1),
+ Load4.getValue(1));
+
+ SDValue Store1 =
+ DAG.getStore(Chain, dl, Load1,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
+ DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Store2 = DAG.getStore(
+ Chain, dl, Load2,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(16)),
+ DstPtrInfo.getWithOffset(16), Alignment, MMOFlags);
+
+ SDValue Store3 = DAG.getStore(
+ Chain, dl, Load3,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(32)),
+ DstPtrInfo.getWithOffset(32), Alignment, MMOFlags);
+
+ SDValue Store4 = DAG.getStore(
+ Chain, dl, Load4,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(FourthOffset)),
+ DstPtrInfo.getWithOffset(FourthOffset), Alignment, MMOFlags);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2,
+ Store3, Store4);
+ }
+ }
+
+ // For size 65 bytes: use 4 x v16i8 (vectors) + overlapping i64
+ if (SizeVal == 65) {
+ if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
+ AlignmentIsAcceptable(MVT::i64, Align(1))) {
+
+ SDValue Load1 =
+ DAG.getLoad(MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
+ SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Load2 =
+ DAG.getLoad(MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(16)),
+ SrcPtrInfo.getWithOffset(16), Alignment, MMOFlags);
+
+ SDValue Load3 =
+ DAG.getLoad(MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(32)),
+ SrcPtrInfo.getWithOffset(32), Alignment, MMOFlags);
+
+ SDValue Load4 =
+ DAG.getLoad(MVT::v16i8, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(48)),
+ SrcPtrInfo.getWithOffset(48), Alignment, MMOFlags);
+
+ SDValue Load5 =
+ DAG.getLoad(MVT::i64, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(57)),
+ SrcPtrInfo.getWithOffset(57), Alignment, MMOFlags);
+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1),
+ Load2.getValue(1), Load3.getValue(1),
+ Load4.getValue(1), Load5.getValue(1));
+
+ SDValue Store1 =
+ DAG.getStore(Chain, dl, Load1,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
+ DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+ SDValue Store2 = DAG.getStore(
+ Chain, dl, Load2,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(16)),
+ DstPtrInfo.getWithOffset(16), Alignment, MMOFlags);
+
+ SDValue Store3 = DAG.getStore(
+ Chain, dl, Load3,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(32)),
+ DstPtrInfo.getWithOffset(32), Alignment, MMOFlags);
+
+ SDValue Store4 = DAG.getStore(
+ Chain, dl, Load4,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(48)),
+ DstPtrInfo.getWithOffset(48), Alignment, MMOFlags);
+
+ SDValue Store5 = DAG.getStore(
+ Chain, dl, Load5,
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(57)),
+ DstPtrInfo.getWithOffset(57), Alignment, MMOFlags);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2,
+ Store3, Store4, Store5);
+ }
+ }
+ }
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/AArch64/memmove-inline.ll b/llvm/test/CodeGen/AArch64/memmove-inline.ll
index 641c48dd0f1c5..0ece0feda9da8 100644
--- a/llvm/test/CodeGen/AArch64/memmove-inline.ll
+++ b/llvm/test/CodeGen/AArch64/memmove-inline.ll
@@ -119,4 +119,102 @@ entry:
ret void
}
+; Test overlapping memmove optimization for non-power-of-two sizes
+; These should use overlapping loads/stores instead of mixed-size operations
+
+define void @move7(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move7:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldur w8, [x1, #3]
+; CHECK-ALIGNED-NEXT: ldr w9, [x1]
+; CHECK-ALIGNED-NEXT: stur w8, [x0, #3]
+; CHECK-ALIGNED-NEXT: str w9, [x0]
+; CHECK-ALIGNED-NEXT: ret
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 7, i1 false)
+ ret void
+}
+
+define void @move13(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move13:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldur x8, [x1, #5]
+; CHECK-ALIGNED-NEXT: ldr x9, [x1]
+; CHECK-ALIGNED-NEXT: stur x8, [x0, #5]
+; CHECK-ALIGNED-NEXT: str x9, [x0]
+; CHECK-ALIGNED-NEXT: ret
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 13, i1 false)
+ ret void
+}
+
+define void @move15(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move15:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldur x8, [x1, #7]
+; CHECK-ALIGNED-NEXT: ldr x9, [x1]
+; CHECK-ALIGNED-NEXT: stur x8, [x0, #7]
+; CHECK-ALIGNED-NEXT: str x9, [x0]
+; CHECK-ALIGNED-NEXT: ret
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 15, i1 false)
+ ret void
+}
+
+define void @move25(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move25:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldur x8, [x1, #17]
+; CHECK-ALIGNED-NEXT: ldr q0, [x1]
+; CHECK-ALIGNED-NEXT: stur x8, [x0, #17]
+; CHECK-ALIGNED-NEXT: str q0, [x0]
+; CHECK-ALIGNED-NEXT: ret
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 25, i1 false)
+ ret void
+}
+
+define void @move33(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move33:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldp q1, q0, [x1]
+; CHECK-ALIGNED-NEXT: ldur x8, [x1, #25]
+; CHECK-ALIGNED-NEXT: stur x8, [x0, #25]
+; CHECK-ALIGNED-NEXT: stp q1, q0, [x0]
+; CHECK-ALIGNED-NEXT: ret
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 33, i1 false)
+ ret void
+}
+
+define void @move49(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move49:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldp q2, q0, [x1, #16]
+; CHECK-ALIGNED-NEXT: ldur x8, [x1, #41]
+; CHECK-ALIGNED-NEXT: ldr q1, [x1]
+; CHECK-ALIGNED-NEXT: stur x8, [x0, #41]
+; CHECK-ALIGNED-NEXT: stp q2, q0, [x0, #16]
+; CHECK-ALIGNED-NEXT: str q1, [x0]
+; CHECK-ALIGNED-NEXT: ret
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 49, i1 false)
+ ret void
+}
+
+define void @move65(ptr %out, ptr %in) {
+; CHECK-ALIGNED-LABEL: move65:
+; CHECK-ALIGNED: // %bb.0: // %entry
+; CHECK-ALIGNED-NEXT: ldp q0, q1, [x1, #32]
+; CHECK-ALIGNED-NEXT: ldur x8, [x1, #57]
+; CHECK-ALIGNED-NEXT: ldp q2, q3, [x1]
+; CHECK-ALIGNED-NEXT: stur x8, [x0, #57]
+; CHECK-ALIGNED-NEXT: stp q0, q1, [x0, #32]
+; CHECK-ALIGNED-NEXT: stp q2, q3, [x0]
+; CHECK-ALIGNED-NEXT: ret
+entry:
+ call void @llvm.memmove.p0.p0.i64(ptr %out, ptr %in, i64 65, i1 false)
+ ret void
+}
+
declare void @llvm.memmove.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1)
>From 3562902d341304fede356b8f0dad12375620f9a5 Mon Sep 17 00:00:00 2001
From: Osama Abdelkader <osama.abdelkader at gmail.com>
Date: Thu, 20 Nov 2025 18:59:35 +0200
Subject: [PATCH 2/2] AArch64: Optimize memmove for non-power-of-two sizes
refactoring
- Refactored repetitive code
- Simplified the size check using binary arithmetic: (Size & (Size - 1)) != 0
- Updated comments
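For reference, the candidate-size test reduces to a standard bit trick: a power of
two has exactly one bit set, so (Size & (Size - 1)) is zero only for powers of two.
A minimal sketch of the check as used in this patch (the helper name is made up for
illustration):

  static bool isCandidateMemmoveSize(uint64_t Size) {
    // Powers of two are already lowered well; sizes <= 4, > 65, and 10 are
    // left to the existing lowering paths.
    return Size > 4 && Size <= 65 && Size != 10 && (Size & (Size - 1)) != 0;
  }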
Signed-off-by: Osama Abdelkader <osama.abdelkader at gmail.com>
---
.../Target/AArch64/AArch64ISelLowering.cpp | 14 +-
.../AArch64/AArch64SelectionDAGInfo.cpp | 416 +++++-------------
2 files changed, 125 insertions(+), 305 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2460921050229..d314db2407f14 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18508,16 +18508,10 @@ bool AArch64TargetLowering::findOptimalMemOpLowering(
const AttributeList &FuncAttributes) const {
if (!Op.isMemset() && !Op.allowOverlap()) {
uint64_t Size = Op.size();
- bool HandledSize = (Size >= 5 && Size <= 7) ||
- (Size == 9) ||
- (Size >= 11 && Size <= 15) ||
- (Size >= 17 && Size <= 23) ||
- (Size >= 25 && Size <= 31) ||
- (Size >= 33 && Size <= 47) ||
- (Size >= 49 && Size <= 63) ||
- (Size == 65);
-
- if (HandledSize) {
+ // Only handle non-power-of-two sizes > 4 and <= 65, excluding size 10
+ // which doesn't show improvement. Check if size is non-power-of-two:
+ // (Size & (Size - 1)) != 0
+ if (Size > 4 && Size <= 65 && Size != 10 && (Size & (Size - 1)) != 0) {
auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
if (Op.isAligned(AlignCheck))
return true;
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 60d20672c46cb..d89719add476f 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -12,6 +12,7 @@
#include "AArch64SelectionDAGInfo.h"
#include "AArch64MachineFunctionInfo.h"
+#include "llvm/ADT/ArrayRef.h"
#define GET_SDNODE_DESC
#include "AArch64GenSDNodeInfo.inc"
@@ -236,6 +237,44 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
return SDValue();
}
+// Helper function to generate overlapping loads/stores for memmove.
+// Takes a list of (EVT, offset) pairs for loads/stores and generates the DAG.
+static SDValue EmitOverlappingMemmove(
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ ArrayRef<std::pair<EVT, uint64_t>> LoadOps, Align Alignment,
+ MachineMemOperand::Flags MMOFlags, MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) {
+ SmallVector<SDValue, 8> Loads;
+ SmallVector<SDValue, 8> LoadChains;
+
+ // Generate all loads
+ for (const auto &[VT, Offset] : LoadOps) {
+ SDValue Load = DAG.getLoad(
+ VT, dl, Chain,
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(Offset)),
+ SrcPtrInfo.getWithOffset(Offset), Alignment, MMOFlags);
+ Loads.push_back(Load);
+ LoadChains.push_back(Load.getValue(1));
+ }
+
+ // Combine all load chains
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+
+ // Generate all stores
+ SmallVector<SDValue, 8> Stores;
+ for (size_t i = 0; i < LoadOps.size(); ++i) {
+ uint64_t Offset = LoadOps[i].second;
+ SDValue Store = DAG.getStore(
+ Chain, dl, Loads[i],
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(Offset)),
+ DstPtrInfo.getWithOffset(Offset), Alignment, MMOFlags);
+ Stores.push_back(Store);
+ }
+
+ // Combine all store chains
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+}
+
SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, Align Alignment, bool isVolatile,
@@ -256,7 +295,9 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
// Handle small memmove cases with overlapping loads/stores for better codegen
// For non-power-of-two sizes, use overlapping operations instead of
// mixed-size operations (e.g., for 7 bytes: two i32 loads/stores with overlap
- // instead of i32 + i16 + i8)
+ // instead of i32 + i16 + i8). This optimization provides significant
+ // improvement for most sizes, though some specific sizes (e.g., 33, 49, 65)
+ // may show less improvement than others in their range.
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Size)) {
uint64_t SizeVal = C->getZExtValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -274,306 +315,91 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
MachineMemOperand::Flags MMOFlags =
isVolatile ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
- // For sizes 5-7 bytes: use two overlapping i32 operations
- if (SizeVal >= 5 && SizeVal <= 7) {
- if (AlignmentIsAcceptable(MVT::i32, Align(1))) {
- uint64_t SecondOffset = SizeVal - 4;
-
- SDValue Load1 =
- DAG.getLoad(MVT::i32, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
- SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
-
- SDValue Load2 = DAG.getLoad(
- MVT::i32, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SecondOffset)),
- SrcPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
-
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1),
- Load2.getValue(1));
-
- SDValue Store1 =
- DAG.getStore(Chain, dl, Load1,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
- DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
-
- SDValue Store2 = DAG.getStore(
- Chain, dl, Load2,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(SecondOffset)),
- DstPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
-
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+ // Only handle non-power-of-two sizes > 4 and <= 65
+ // Check if size is non-power-of-two: (Size & (Size - 1)) != 0
+ if (SizeVal > 4 && SizeVal <= 65 && (SizeVal & (SizeVal - 1)) != 0) {
+ SmallVector<std::pair<EVT, uint64_t>, 4> LoadOps;
+
+ // For sizes 5-7 bytes: use two overlapping i32 operations
+ if (SizeVal >= 5 && SizeVal <= 7) {
+ if (AlignmentIsAcceptable(MVT::i32, Align(1))) {
+ LoadOps.push_back({MVT::i32, 0});
+ LoadOps.push_back({MVT::i32, SizeVal - 4});
+ return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps,
+ Alignment, MMOFlags, DstPtrInfo,
+ SrcPtrInfo);
+ }
}
- }
-
- // For sizes 9-15 bytes: use i64 + overlapping i64
- if (SizeVal >= 9 && SizeVal <= 15) {
- if (AlignmentIsAcceptable(MVT::i64, Align(1))) {
- uint64_t SecondOffset = SizeVal - 8;
-
- SDValue Load1 =
- DAG.getLoad(MVT::i64, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
- SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
-
- SDValue Load2 = DAG.getLoad(
- MVT::i64, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SecondOffset)),
- SrcPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
-
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1),
- Load2.getValue(1));
-
- SDValue Store1 =
- DAG.getStore(Chain, dl, Load1,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
- DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
-
- SDValue Store2 = DAG.getStore(
- Chain, dl, Load2,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(SecondOffset)),
- DstPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
-
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+ // For sizes 9-15 bytes: use i64 + overlapping i64
+ else if (SizeVal >= 9 && SizeVal <= 15) {
+ if (AlignmentIsAcceptable(MVT::i64, Align(1))) {
+ LoadOps.push_back({MVT::i64, 0});
+ LoadOps.push_back({MVT::i64, SizeVal - 8});
+ return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps,
+ Alignment, MMOFlags, DstPtrInfo,
+ SrcPtrInfo);
+ }
}
- }
-
- // For sizes 17-23 bytes: use i64 + i64 + overlapping i64
- if (SizeVal >= 17 && SizeVal <= 23) {
- if (AlignmentIsAcceptable(MVT::i64, Align(1))) {
- uint64_t ThirdOffset = SizeVal - 8;
-
- SDValue Load1 =
- DAG.getLoad(MVT::i64, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
- SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
-
- SDValue Load2 =
- DAG.getLoad(MVT::i64, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(8)),
- SrcPtrInfo.getWithOffset(8), Alignment, MMOFlags);
-
- SDValue Load3 = DAG.getLoad(
- MVT::i64, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(ThirdOffset)),
- SrcPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags);
-
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1),
- Load2.getValue(1), Load3.getValue(1));
-
- SDValue Store1 =
- DAG.getStore(Chain, dl, Load1,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
- DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
-
- SDValue Store2 =
- DAG.getStore(Chain, dl, Load2,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(8)),
- DstPtrInfo.getWithOffset(8), Alignment, MMOFlags);
-
- SDValue Store3 = DAG.getStore(
- Chain, dl, Load3,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(ThirdOffset)),
- DstPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags);
-
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2,
- Store3);
+ // For sizes 17-23 bytes: use i64 + i64 + overlapping i64
+ else if (SizeVal >= 17 && SizeVal <= 23) {
+ if (AlignmentIsAcceptable(MVT::i64, Align(1))) {
+ LoadOps.push_back({MVT::i64, 0});
+ LoadOps.push_back({MVT::i64, 8});
+ LoadOps.push_back({MVT::i64, SizeVal - 8});
+ return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps,
+ Alignment, MMOFlags, DstPtrInfo,
+ SrcPtrInfo);
+ }
}
- }
-
- // For sizes 25-31 bytes: use v16i8 (vector) + overlapping i64
- if (SizeVal >= 25 && SizeVal <= 31) {
- if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
- AlignmentIsAcceptable(MVT::i64, Align(1))) {
- uint64_t SecondOffset = SizeVal - 8;
-
- SDValue Load1 =
- DAG.getLoad(MVT::v16i8, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
- SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
-
- SDValue Load2 = DAG.getLoad(
- MVT::i64, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SecondOffset)),
- SrcPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
-
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1),
- Load2.getValue(1));
-
- SDValue Store1 =
- DAG.getStore(Chain, dl, Load1,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
- DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
-
- SDValue Store2 = DAG.getStore(
- Chain, dl, Load2,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(SecondOffset)),
- DstPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
-
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+ // For sizes 25-31 bytes: use v16i8 (vector) + overlapping i64
+ else if (SizeVal >= 25 && SizeVal <= 31) {
+ if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
+ AlignmentIsAcceptable(MVT::i64, Align(1))) {
+ LoadOps.push_back({MVT::v16i8, 0});
+ LoadOps.push_back({MVT::i64, SizeVal - 8});
+ return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps,
+ Alignment, MMOFlags, DstPtrInfo,
+ SrcPtrInfo);
+ }
}
- }
-
- // For sizes 33-47 bytes: use 2 x v16i8 (vectors) + overlapping i64
- if (SizeVal >= 33 && SizeVal <= 47) {
- if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
- AlignmentIsAcceptable(MVT::i64, Align(1))) {
- uint64_t ThirdOffset = SizeVal - 8;
-
- SDValue Load1 =
- DAG.getLoad(MVT::v16i8, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
- SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
-
- SDValue Load2 =
- DAG.getLoad(MVT::v16i8, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(16)),
- SrcPtrInfo.getWithOffset(16), Alignment, MMOFlags);
-
- SDValue Load3 = DAG.getLoad(
- MVT::i64, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(ThirdOffset)),
- SrcPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags);
-
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1),
- Load2.getValue(1), Load3.getValue(1));
-
- SDValue Store1 =
- DAG.getStore(Chain, dl, Load1,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
- DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
-
- SDValue Store2 = DAG.getStore(
- Chain, dl, Load2,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(16)),
- DstPtrInfo.getWithOffset(16), Alignment, MMOFlags);
-
- SDValue Store3 = DAG.getStore(
- Chain, dl, Load3,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(ThirdOffset)),
- DstPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags);
-
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2,
- Store3);
+ // For sizes 33-47 bytes: use 2 x v16i8 (vectors) + overlapping i64
+ else if (SizeVal >= 33 && SizeVal <= 47) {
+ if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
+ AlignmentIsAcceptable(MVT::i64, Align(1))) {
+ LoadOps.push_back({MVT::v16i8, 0});
+ LoadOps.push_back({MVT::v16i8, 16});
+ LoadOps.push_back({MVT::i64, SizeVal - 8});
+ return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps,
+ Alignment, MMOFlags, DstPtrInfo,
+ SrcPtrInfo);
+ }
}
- }
-
- // For sizes 49-63 bytes: use 3 x v16i8 (vectors) + overlapping i64
- if (SizeVal >= 49 && SizeVal <= 63) {
- if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
- AlignmentIsAcceptable(MVT::i64, Align(1))) {
- uint64_t FourthOffset = SizeVal - 8;
-
- SDValue Load1 =
- DAG.getLoad(MVT::v16i8, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
- SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
-
- SDValue Load2 =
- DAG.getLoad(MVT::v16i8, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(16)),
- SrcPtrInfo.getWithOffset(16), Alignment, MMOFlags);
-
- SDValue Load3 =
- DAG.getLoad(MVT::v16i8, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(32)),
- SrcPtrInfo.getWithOffset(32), Alignment, MMOFlags);
-
- SDValue Load4 = DAG.getLoad(
- MVT::i64, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(FourthOffset)),
- SrcPtrInfo.getWithOffset(FourthOffset), Alignment, MMOFlags);
-
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1),
- Load2.getValue(1), Load3.getValue(1),
- Load4.getValue(1));
-
- SDValue Store1 =
- DAG.getStore(Chain, dl, Load1,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
- DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
-
- SDValue Store2 = DAG.getStore(
- Chain, dl, Load2,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(16)),
- DstPtrInfo.getWithOffset(16), Alignment, MMOFlags);
-
- SDValue Store3 = DAG.getStore(
- Chain, dl, Load3,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(32)),
- DstPtrInfo.getWithOffset(32), Alignment, MMOFlags);
-
- SDValue Store4 = DAG.getStore(
- Chain, dl, Load4,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(FourthOffset)),
- DstPtrInfo.getWithOffset(FourthOffset), Alignment, MMOFlags);
-
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2,
- Store3, Store4);
+ // For sizes 49-63 bytes: use 3 x v16i8 (vectors) + overlapping i64
+ else if (SizeVal >= 49 && SizeVal <= 63) {
+ if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
+ AlignmentIsAcceptable(MVT::i64, Align(1))) {
+ LoadOps.push_back({MVT::v16i8, 0});
+ LoadOps.push_back({MVT::v16i8, 16});
+ LoadOps.push_back({MVT::v16i8, 32});
+ LoadOps.push_back({MVT::i64, SizeVal - 8});
+ return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps,
+ Alignment, MMOFlags, DstPtrInfo,
+ SrcPtrInfo);
+ }
}
- }
-
- // For size 65 bytes: use 4 x v16i8 (vectors) + overlapping i64
- if (SizeVal == 65) {
- if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
- AlignmentIsAcceptable(MVT::i64, Align(1))) {
-
- SDValue Load1 =
- DAG.getLoad(MVT::v16i8, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
- SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
-
- SDValue Load2 =
- DAG.getLoad(MVT::v16i8, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(16)),
- SrcPtrInfo.getWithOffset(16), Alignment, MMOFlags);
-
- SDValue Load3 =
- DAG.getLoad(MVT::v16i8, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(32)),
- SrcPtrInfo.getWithOffset(32), Alignment, MMOFlags);
-
- SDValue Load4 =
- DAG.getLoad(MVT::v16i8, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(48)),
- SrcPtrInfo.getWithOffset(48), Alignment, MMOFlags);
-
- SDValue Load5 =
- DAG.getLoad(MVT::i64, dl, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(57)),
- SrcPtrInfo.getWithOffset(57), Alignment, MMOFlags);
-
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1),
- Load2.getValue(1), Load3.getValue(1),
- Load4.getValue(1), Load5.getValue(1));
-
- SDValue Store1 =
- DAG.getStore(Chain, dl, Load1,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
- DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
-
- SDValue Store2 = DAG.getStore(
- Chain, dl, Load2,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(16)),
- DstPtrInfo.getWithOffset(16), Alignment, MMOFlags);
-
- SDValue Store3 = DAG.getStore(
- Chain, dl, Load3,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(32)),
- DstPtrInfo.getWithOffset(32), Alignment, MMOFlags);
-
- SDValue Store4 = DAG.getStore(
- Chain, dl, Load4,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(48)),
- DstPtrInfo.getWithOffset(48), Alignment, MMOFlags);
-
- SDValue Store5 = DAG.getStore(
- Chain, dl, Load5,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(57)),
- DstPtrInfo.getWithOffset(57), Alignment, MMOFlags);
-
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2,
- Store3, Store4, Store5);
+ // For size 65 bytes: use 4 x v16i8 (vectors) + overlapping i64
+ else if (SizeVal == 65) {
+ if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
+ AlignmentIsAcceptable(MVT::i64, Align(1))) {
+ LoadOps.push_back({MVT::v16i8, 0});
+ LoadOps.push_back({MVT::v16i8, 16});
+ LoadOps.push_back({MVT::v16i8, 32});
+ LoadOps.push_back({MVT::v16i8, 48});
+ LoadOps.push_back({MVT::i64, SizeVal - 8});
+ return EmitOverlappingMemmove(DAG, dl, Chain, Dst, Src, LoadOps,
+ Alignment, MMOFlags, DstPtrInfo,
+ SrcPtrInfo);
+ }
}
}
}