[llvm] AArch64: Optimize memmove for non-power-of-two sizes (PR #168633)

Thu Nov 20 09:01:45 PST 2025

================
@@ -252,6 +252,332 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
   if (LowerToSMERoutines && !Attrs.hasNonStreamingInterfaceAndBody())
     return EmitStreamingCompatibleMemLibCall(DAG, dl, Chain, Dst, Src, Size,
                                              RTLIB::MEMMOVE);
+
+  // Handle small memmove cases with overlapping loads/stores for better codegen
+  // For non-power-of-two sizes, use overlapping operations instead of
+  // mixed-size operations (e.g., for 7 bytes: two i32 loads/stores with overlap
+  // instead of i32 + i16 + i8)
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Size)) {
+    uint64_t SizeVal = C->getZExtValue();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+    auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
+      if (Alignment >= AlignCheck)
+        return true;
+      unsigned Fast;
+      return TLI.allowsMisalignedMemoryAccesses(
+                 VT, DstPtrInfo.getAddrSpace(), Align(1),
+                 MachineMemOperand::MONone, &Fast) &&
+             Fast;
+    };
+
+    MachineMemOperand::Flags MMOFlags =
+        isVolatile ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
+
+    // For sizes 5-7 bytes: use two overlapping i32 operations
+    if (SizeVal >= 5 && SizeVal <= 7) {
----------------
osamakader wrote:

Done.

https://github.com/llvm/llvm-project/pull/168633