[llvm] AArch64: Optimize memmove for non-power-of-two sizes (PR #168633)

Nashe Mncube via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 20 03:17:43 PST 2025


================
@@ -252,6 +252,332 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
   if (LowerToSMERoutines && !Attrs.hasNonStreamingInterfaceAndBody())
     return EmitStreamingCompatibleMemLibCall(DAG, dl, Chain, Dst, Src, Size,
                                              RTLIB::MEMMOVE);
+
+  // Handle small memmove cases with overlapping loads/stores for better codegen
+  // For non-power-of-two sizes, use overlapping operations instead of
+  // mixed-size operations (e.g., for 7 bytes: two i32 loads/stores with overlap
+  // instead of i32 + i16 + i8)
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Size)) {
+    uint64_t SizeVal = C->getZExtValue();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+    auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
+      if (Alignment >= AlignCheck)
+        return true;
+      unsigned Fast;
+      return TLI.allowsMisalignedMemoryAccesses(
+                 VT, DstPtrInfo.getAddrSpace(), Align(1),
+                 MachineMemOperand::MONone, &Fast) &&
+             Fast;
+    };
+
+    MachineMemOperand::Flags MMOFlags =
+        isVolatile ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
+
+    // For sizes 5-7 bytes: use two overlapping i32 operations
+    if (SizeVal >= 5 && SizeVal <= 7) {
+      if (AlignmentIsAcceptable(MVT::i32, Align(1))) {
+        uint64_t SecondOffset = SizeVal - 4;
+
+        SDValue Load1 =
+            DAG.getLoad(MVT::i32, dl, Chain,
+                        DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
+                        SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+        SDValue Load2 = DAG.getLoad(
+            MVT::i32, dl, Chain,
+            DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SecondOffset)),
+            SrcPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
+
+        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1),
+                            Load2.getValue(1));
+
+        SDValue Store1 =
+            DAG.getStore(Chain, dl, Load1,
+                         DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
+                         DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+        SDValue Store2 = DAG.getStore(
+            Chain, dl, Load2,
+            DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(SecondOffset)),
+            DstPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
+
+        return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+      }
+    }
+
+    // For sizes 9-15 bytes: use i64 + overlapping i64
+    if (SizeVal >= 9 && SizeVal <= 15) {
+      if (AlignmentIsAcceptable(MVT::i64, Align(1))) {
+        uint64_t SecondOffset = SizeVal - 8;
+
+        SDValue Load1 =
+            DAG.getLoad(MVT::i64, dl, Chain,
+                        DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
+                        SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+        SDValue Load2 = DAG.getLoad(
+            MVT::i64, dl, Chain,
+            DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SecondOffset)),
+            SrcPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
+        
+        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1),
+                            Load2.getValue(1));
+
+        SDValue Store1 =
+            DAG.getStore(Chain, dl, Load1,
+                         DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
+                         DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+        SDValue Store2 = DAG.getStore(
+            Chain, dl, Load2,
+            DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(SecondOffset)),
+            DstPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
+
+        return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+      }
+    }
+    
+    // For sizes 17-23 bytes: use i64 + i64 + overlapping i64
+    if (SizeVal >= 17 && SizeVal <= 23) {
+      if (AlignmentIsAcceptable(MVT::i64, Align(1))) {
+        uint64_t ThirdOffset = SizeVal - 8;
+
+        SDValue Load1 =
+            DAG.getLoad(MVT::i64, dl, Chain,
+                        DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
+                        SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+        SDValue Load2 =
+            DAG.getLoad(MVT::i64, dl, Chain,
+                        DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(8)),
+                        SrcPtrInfo.getWithOffset(8), Alignment, MMOFlags);
+
+        SDValue Load3 = DAG.getLoad(
+            MVT::i64, dl, Chain,
+            DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(ThirdOffset)),
+            SrcPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags);
+
+        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1),
+                            Load2.getValue(1), Load3.getValue(1));
+
+        SDValue Store1 =
+            DAG.getStore(Chain, dl, Load1,
+                         DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
+                         DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+        SDValue Store2 =
+            DAG.getStore(Chain, dl, Load2,
+                         DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(8)),
+                         DstPtrInfo.getWithOffset(8), Alignment, MMOFlags);
+
+        SDValue Store3 = DAG.getStore(
+            Chain, dl, Load3,
+            DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(ThirdOffset)),
+            DstPtrInfo.getWithOffset(ThirdOffset), Alignment, MMOFlags);
+
+        return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2,
+                           Store3);
+      }
+    }
+
+    // For sizes 25-31 bytes: use v16i8 (vector) + overlapping i64
+    if (SizeVal >= 25 && SizeVal <= 31) {
+      if (AlignmentIsAcceptable(MVT::v16i8, Align(1)) &&
+          AlignmentIsAcceptable(MVT::i64, Align(1))) {
+        uint64_t SecondOffset = SizeVal - 8;
+
+        SDValue Load1 =
+            DAG.getLoad(MVT::v16i8, dl, Chain,
+                        DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(0)),
+                        SrcPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+        SDValue Load2 = DAG.getLoad(
+            MVT::i64, dl, Chain,
+            DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SecondOffset)),
+            SrcPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
+
+        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1),
+                            Load2.getValue(1));
+
+        SDValue Store1 =
+            DAG.getStore(Chain, dl, Load1,
+                         DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(0)),
+                         DstPtrInfo.getWithOffset(0), Alignment, MMOFlags);
+
+        SDValue Store2 = DAG.getStore(
+            Chain, dl, Load2,
+            DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(SecondOffset)),
+            DstPtrInfo.getWithOffset(SecondOffset), Alignment, MMOFlags);
+
+        return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+      }
+    }
+
+    // For sizes 33-47 bytes: use 2 x v16i8 (vectors) + overlapping i64
+    if (SizeVal >= 33 && SizeVal <= 47) {
----------------
nasherm wrote:

Looking at the codegen here: https://godbolt.org/z/nEYvE5YY7

It doesn't seem like we're getting much of an improvement for cases where `SizeVal > 32`.

https://github.com/llvm/llvm-project/pull/168633


More information about the llvm-commits mailing list