[PATCH][AArch64] Use 8-byte load&store for inlined memcpy() on Cortex A53

Sergey Dmitrouk sdmitrouk at accesssoftek.com
Tue Jul 15 01:15:19 PDT 2014


Hi,

Based on the following information from [this post][0] by James Molloy:

  * Our inline memcpy expansion pass is emitting "LDR q0, [..]; STR q0,
  [..]" pairs, which is less than ideal on A53. If we switched to
  emitting "LDP x0, x1, [..]; STP x0, x1, [..]", we'd get around 30%
  better inline memcpy performance on A53. A57 seems to deal well with
  the LDR q sequence.
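
Roughly, for a 16-byte copy the two expansions look like this (a
sketch; register numbers are illustrative, and the load/store pair
optimizer may later merge the x-register loads/stores into ldp/stp):

  // current expansion: one 128-bit SIMD round trip
  ldr  q0, [x1]
  str  q0, [x0]

  // proposed expansion for Cortex-A53: 64-bit GPR copies
  ldr  x8, [x1]
  str  x8, [x0]
  ldr  x9, [x1, #8]
  str  x9, [x0, #8]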

I've made a patch (attached) that switches the inline memcpy expansion
to 64-bit registers for Cortex-A53.  Please take a look at it.
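
The attached test pins down the expected sequence; the codegen can also
be inspected by hand with the same llc invocation the test uses (the
input file name here is just a placeholder):

  llc < memcpy-a53.ll -march=aarch64 -mtriple=aarch64-linux-gnu \
      -mcpu=cortex-a53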

Best regards,
Sergey

0: http://article.gmane.org/gmane.comp.compilers.llvm.devel/74269
-------------- next part --------------
[AArch64] Use 8-byte load&store for inlined memcpy() on Cortex A53

Cortex-A53 runs inlined memcpy faster when it is expanded with 64-bit
registers than with 128-bit registers, so prefer 64-bit registers when
inlining memcpy for Cortex-A53.
---
 lib/Target/AArch64/AArch64ISelLowering.cpp         | 36 ++++++++++++++--------
 lib/Target/AArch64/AArch64Subtarget.h              |  2 ++
 .../64bit-load-store-memcpy-on-cortex-a53.ll       | 20 ++++++++++++
 3 files changed, 46 insertions(+), 12 deletions(-)
 create mode 100644 test/CodeGen/AArch64/64bit-load-store-memcpy-on-cortex-a53.ll

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 07ff093..7e4ede7 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -367,8 +367,15 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 
   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
-  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
   MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
+  // Cortex-A53 prefers 64-bit registers for inlined memcpy, while other
+  // subtargets do best with 128-bit registers. With registers half as wide,
+  // Cortex-A53 needs twice as many stores to cover the same copy size.
+  if (Subtarget->isCortexA53()) {
+    MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 8;
+  } else {
+    MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
+  }
 
   setStackPointerRegisterToSaveRestore(AArch64::SP);
 
@@ -6153,17 +6160,22 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
                                                bool ZeroMemset,
                                                bool MemcpyStrSrc,
                                                MachineFunction &MF) const {
-  // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
-  // instruction to materialize the v2i64 zero and one store (with restrictive
-  // addressing mode). Just do two i64 store of zero-registers.
-  bool Fast;
-  const Function *F = MF.getFunction();
-  if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
-      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::NoImplicitFloat) &&
-      (memOpAlign(SrcAlign, DstAlign, 16) ||
-       (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast)))
-    return MVT::f128;
+  // Cortex-A53 executes inlined memcpy faster with 64-bit registers than
+  // with 128-bit ones, so prefer 64-bit registers when inlining memcpy
+  // for Cortex-A53.
+  if (!Subtarget->isCortexA53() || IsMemset) {
+    // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
+    // instruction to materialize the v2i64 zero and one store (with restrictive
+    // addressing mode). Just do two i64 store of zero-registers.
+    bool Fast;
+    const Function *F = MF.getFunction();
+    if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
+        !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                         Attribute::NoImplicitFloat) &&
+        (memOpAlign(SrcAlign, DstAlign, 16) ||
+         (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast)))
+      return MVT::f128;
+  }
 
   return Size >= 8 ? MVT::i64 : MVT::i32;
 }
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 52124f6..3a865af 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -99,6 +99,8 @@ public:
 
   bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
 
+  bool isCortexA53() const { return CPUString == "cortex-a53"; }
+
   bool isCyclone() const { return CPUString == "cyclone"; }
 
   /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
diff --git a/test/CodeGen/AArch64/64bit-load-store-memcpy-on-cortex-a53.ll b/test/CodeGen/AArch64/64bit-load-store-memcpy-on-cortex-a53.ll
new file mode 100644
index 0000000..788ae98
--- /dev/null
+++ b/test/CodeGen/AArch64/64bit-load-store-memcpy-on-cortex-a53.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=aarch64 -mtriple=aarch64-linux-gnu -mcpu=cortex-a53 | FileCheck %s
+
+; marked as external to prevent possible optimizations
+@b = external global [4 x i32]
+
+define void @copy-16-bytes-with-8-byte-registers() {
+; CHECK-LABEL: @copy-16-bytes-with-8-byte-registers
+; CHECK: adrp
+; CHECK: add
+; CHECK: ldr x9
+; CHECK: str x9
+; CHECK: ldr x8
+; CHECK: str x8
+; CHECK: ret
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* undef, i8* bitcast ([4 x i32]* @b to i8*), i64 16, i32 8, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)

