[llvm] r230788 - [PowerPC] Use vector types for memcpy and friends (sometimes)

Fri Feb 27 11:58:28 PST 2015

Author: hfinkel
Date: Fri Feb 27 13:58:28 2015
New Revision: 230788

URL: http://llvm.org/viewvc/llvm-project?rev=230788&view=rev
Log:
[PowerPC] Use vector types for memcpy and friends (sometimes)

When using Altivec, we can use vector loads and stores for aligned memcpy and
friends. Starting with the P7 and VXS, we have reasonable unaligned vector
stores. Starting with the P8, we have fast unaligned loads too.

For QPX, we use vector loads are stores, but only for aligned memory accesses.

Added:
    llvm/trunk/test/CodeGen/PowerPC/memcpy-vec.ll
Modified:
    llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
    llvm/trunk/test/CodeGen/PowerPC/ppc64-byval-align.ll

Modified: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp?rev=230788&r1=230787&r2=230788&view=diff
==============================================================================

--- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp Fri Feb 27 13:58:28 2015
@@ -892,6 +892,13 @@ PPCTargetLowering::PPCTargetLowering(con
     MaxStoresPerMemcpyOptSize = 8;
     MaxStoresPerMemmove = 32;
     MaxStoresPerMemmoveOptSize = 8;
+  } else if (Subtarget.getDarwinDirective() == PPC::DIR_A2) {
+    // The A2 also benefits from (very) aggressive inlining of memcpy and
+    // friends. The overhead of a the function call, even when warm, can be
+    // over one hundred cycles.
+    MaxStoresPerMemset = 128;
+    MaxStoresPerMemcpy = 128;
+    MaxStoresPerMemmove = 128;
   }
 }
 
@@ -10914,11 +10921,27 @@ EVT PPCTargetLowering::getOptimalMemOpTy
                                            bool IsMemset, bool ZeroMemset,
                                            bool MemcpyStrSrc,
                                            MachineFunction &MF) const {
+  const Function *F = MF.getFunction();
+  // When expanding a memset, require at least two QPX instructions to cover
+  // the cost of loading the value to be stored from the constant pool.
+  if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) &&
+     (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) &&
+      !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+    return MVT::v4f64;
+  }
+
+  // We should use Altivec/VSX loads and stores when available. For unaligned
+  // addresses, unaligned VSX loads are only fast starting with the P8.
+  if (Subtarget.hasAltivec() && Size >= 16 &&
+      (((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) ||
+       ((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
+    return MVT::v4i32;
+
   if (Subtarget.isPPC64()) {
     return MVT::i64;
-  } else {
-    return MVT::i32;
   }
+
+  return MVT::i32;
 }
 
 /// \brief Returns true if it is beneficial to convert a load of a constant

Added: llvm/trunk/test/CodeGen/PowerPC/memcpy-vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/memcpy-vec.ll?rev=230788&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/memcpy-vec.ll (added)
+++ llvm/trunk/test/CodeGen/PowerPC/memcpy-vec.ll Fri Feb 27 13:58:28 2015
@@ -0,0 +1,110 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s -check-prefix=PWR7
+; RUN: llc -mcpu=pwr8 < %s | FileCheck %s -check-prefix=PWR8
+; RUN: llc -mcpu=a2q < %s | FileCheck %s -check-prefix=A2Q
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @foo1(double* nocapture %x, double* nocapture readonly %y) #0 {
+entry:
+  %0 = bitcast double* %x to i8*
+  %1 = bitcast double* %y to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 32, i32 8, i1 false)
+  ret void
+
+; PWR7-LABEL: @foo1
+; PWR7-NOT: bl memcpy
+; PWR7: ld {{[0-9]+}}, {{[0-9]+}}(4)
+; PWR7: std {{[0-9]+}}, {{[0-9]+}}(3)
+; PWR7: blr
+
+; PWR8-LABEL: @foo1
+; PWR8: lxvw4x
+; PWR8: stxvw4x
+; PWR8: blr
+
+; A2Q-LABEL: @foo1
+; A2Q-NOT: bl memcpy
+; A2Q: ld {{[0-9]+}}, {{[0-9]+}}(4)
+; A2Q: std {{[0-9]+}}, {{[0-9]+}}(3)
+; A2Q: blr
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0
+
+; Function Attrs: nounwind
+define void @foo2(double* nocapture %x, double* nocapture readonly %y) #0 {
+entry:
+  %0 = bitcast double* %x to i8*
+  %1 = bitcast double* %y to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 128, i32 8, i1 false)
+  ret void
+
+; PWR7-LABEL: @foo2
+; PWR7: bl memcpy
+; PWR7: blr
+
+; PWR8-LABEL: @foo2
+; PWR8: lxvw4x
+; PWR8: stxvw4x
+; PWR8: blr
+
+; A2Q-LABEL: @foo2
+; A2Q-NOT: bl memcpy
+; A2Q: ld {{[0-9]+}}, {{[0-9]+}}(4)
+; A2Q: std {{[0-9]+}}, {{[0-9]+}}(3)
+; A2Q: blr
+}
+
+; Function Attrs: nounwind
+define void @bar1(double* nocapture %x) #0 {
+entry:
+  %0 = bitcast double* %x to i8*
+  tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 128, i32 8, i1 false)
+  ret void
+
+; PWR7-LABEL: @bar1
+; PWR7-NOT: bl memset
+; PWR7: stxvw4x
+; PWR7: blr
+
+; PWR8-LABEL: @bar1
+; PWR8-NOT: bl memset
+; PWR8: stxvw4x
+; PWR8: blr
+
+; A2Q-LABEL: @bar1
+; A2Q-NOT: bl memset
+; A2Q: std {{[0-9]+}}, {{[0-9]+}}(3)
+; A2Q: blr
+}
+
+; Function Attrs: nounwind
+define void @bar2(double* nocapture %x) #0 {
+entry:
+  %0 = bitcast double* %x to i8*
+  tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 128, i32 32, i1 false)
+  ret void
+
+; PWR7-LABEL: @bar2
+; PWR7-NOT: bl memset
+; PWR7: stxvw4x
+; PWR7: blr
+
+; PWR8-LABEL: @bar2
+; PWR8-NOT: bl memset
+; PWR8: stxvw4x
+; PWR8: blr
+
+; A2Q-LABEL: @bar2
+; A2Q-NOT: bl memset
+; A2Q: qvstfdx
+; A2Q: blr
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #0
+
+attributes #0 = { nounwind }
+

Modified: llvm/trunk/test/CodeGen/PowerPC/ppc64-byval-align.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/ppc64-byval-align.ll?rev=230788&r1=230787&r2=230788&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/ppc64-byval-align.ll (original)
+++ llvm/trunk/test/CodeGen/PowerPC/ppc64-byval-align.ll Fri Feb 27 13:58:28 2015
@@ -50,7 +50,9 @@ entry:
 }
 ; CHECK-LABEL: @caller2
 ; CHECK: std 3, [[OFF:[0-9]+]](1)
-; CHECK: ld [[REG:[0-9]+]], [[OFF]](1)
-; CHECK: std [[REG]], 128(1)
+; CHECK: addi [[REG1:[0-9]+]], 1, [[OFF]]
+; CHECK: lxvw4x [[REG2:[0-9]+]], 0, [[REG1]]
+; CHECK: li [[REG3:[0-9]+]], 128
+; CHECK: stxvw4x 0, 1, [[REG3]]
 ; CHECK: bl test2