[llvm] r286875 - [AArch64] Split 0 vector stores into scalar store pairs.
Geoff Berry via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 14 11:39:05 PST 2016
Author: gberry
Date: Mon Nov 14 13:39:04 2016
New Revision: 286875
URL: http://llvm.org/viewvc/llvm-project?rev=286875&view=rev
Log:
[AArch64] Split 0 vector stores into scalar store pairs.
Summary:
Replace a store of a zero splat vector with scalar stores of WZR/XZR. The
load/store optimizer pass will then merge them into store pairs. This should
be better than using a movi to create the vector zero followed by a vector
store when the zero constant is not reused, since one instruction and one
register live range are removed.
For example, the final generated code should be:
stp xzr, xzr, [x0]
instead of:
movi v0.2d, #0
str q0, [x0]
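For instance, two consecutive scalar zero stores such as the following (this
mirrors the merge_zr64 test added below) are first merged into a vector store
by DAGCombiner::MergeConsecutiveStores, then split back into scalar stores by
this change and finally paired into the stp above:

  define void @merge_zr64(i64* %p) {
  entry:
    store i64 0, i64* %p
    %p1 = getelementptr i64, i64* %p, i64 1
    store i64 0, i64* %p1
    ret void
  }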
Reviewers: t.p.northover, mcrosier, MatzeB, jmolloy
Subscribers: aemerson, rengolin, llvm-commits
Differential Revision: https://reviews.llvm.org/D26561
Modified:
llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/trunk/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
llvm/trunk/test/CodeGen/AArch64/ldst-opt.ll
llvm/trunk/test/CodeGen/AArch64/ldst-paired-aliasing.ll
Modified: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp?rev=286875&r1=286874&r2=286875&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp Mon Nov 14 13:39:04 2016
@@ -8799,6 +8799,61 @@ static SDValue split16BStoreSplat(Select
return NewST1;
}
+/// Replace a store of a zero splat vector with scalar stores of WZR/XZR. The
+/// load/store optimizer pass will then merge them into store pairs. This
+/// should be better than a movi to create the vector zero followed by a
+/// vector store when the zero constant is not reused, since one instruction
+/// and one register live range are removed.
+///
+/// For example, the final generated code should be:
+///
+/// stp xzr, xzr, [x0]
+///
+/// instead of:
+///
+/// movi v0.2d, #0
+/// str q0, [x0]
+///
+static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
+ SDValue StVal = St->getValue();
+ EVT VT = StVal.getValueType();
+
+ // We can express a splat as store pair(s) for 2 or 4 elements.
+ int NumVecElts = VT.getVectorNumElements();
+ if (NumVecElts != 4 && NumVecElts != 2)
+ return SDValue();
+
+ if (StVal.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ // If the zero constant has more than one use then the vector store could be
+ // better, since the constant mov will be amortized and stp q instructions
+ // can be formed.
+ if (!StVal.hasOneUse())
+ return SDValue();
+
+ // If the immediate offset of the address operand is too large for the stp
+ // instruction, then bail out.
+ if (DAG.isBaseWithConstantOffset(St->getBasePtr())) {
+ int64_t Offset = St->getBasePtr()->getConstantOperandVal(1);
+ if (Offset < -512 || Offset > 504)
+ return SDValue();
+ }
+
+ for (int I = 0; I < NumVecElts; ++I) {
+ SDValue EltVal = StVal.getOperand(I);
+ if (!isa<ConstantSDNode>(EltVal) ||
+ !cast<ConstantSDNode>(EltVal)->isNullValue())
+ return SDValue();
+ }
+ // Use WZR/XZR here to prevent DAGCombiner::MergeConsecutiveStores from
+ // undoing this transformation.
+ return split16BStoreSplat(
+ DAG, St, NumVecElts == 4 ? DAG.getRegister(AArch64::WZR, MVT::i32)
+ : DAG.getRegister(AArch64::XZR, MVT::i64),
+ NumVecElts);
+}
+
/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
/// value. The load store optimizer pass will merge them to store pair stores.
/// This has better performance than a splat of the scalar followed by a split
@@ -8862,6 +8917,17 @@ static SDValue split16BStores(SDNode *N,
if (S->isVolatile())
return SDValue();
+ SDValue StVal = S->getValue();
+ EVT VT = StVal.getValueType();
+ if (!VT.isVector())
+ return SDValue();
+
+ // If we get a splat of zeros, convert this vector store to a store of
+ // scalars. They will be merged into store pairs of xzr, thereby removing one
+ // instruction and one register.
+ if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, S))
+ return ReplacedZeroSplat;
+
// FIXME: The logic for deciding if an unaligned store should be split should
// be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
// a call to that function here.
@@ -8873,12 +8939,9 @@ static SDValue split16BStores(SDNode *N,
if (DAG.getMachineFunction().getFunction()->optForMinSize())
return SDValue();
- SDValue StVal = S->getValue();
- EVT VT = StVal.getValueType();
-
// Don't split v2i64 vectors. Memcpy lowering produces those and splitting
// those up regresses performance on micro-benchmarks and olden/bh.
- if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
+ if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
return SDValue();
// Split unaligned 16B stores. They are terrible for performance.
Modified: llvm/trunk/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-misched-basic-A53.ll?rev=286875&r1=286874&r2=286875&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-misched-basic-A53.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-misched-basic-A53.ll Mon Nov 14 13:39:04 2016
@@ -182,22 +182,22 @@ declare void @llvm.trap()
; CHECK: LD4Fourv2d
; CHECK: STRQui
; CHECK: ********** INTERVALS **********
-define void @testLdStConflict() {
+define void @testLdStConflict(<2 x i64> %v) {
entry:
br label %loop
loop:
%0 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i8(i8* null)
%ptr = bitcast i8* undef to <2 x i64>*
- store <2 x i64> zeroinitializer, <2 x i64>* %ptr, align 4
+ store <2 x i64> %v, <2 x i64>* %ptr, align 4
%ptr1 = bitcast i8* undef to <2 x i64>*
- store <2 x i64> zeroinitializer, <2 x i64>* %ptr1, align 4
+ store <2 x i64> %v, <2 x i64>* %ptr1, align 4
%ptr2 = bitcast i8* undef to <2 x i64>*
- store <2 x i64> zeroinitializer, <2 x i64>* %ptr2, align 4
+ store <2 x i64> %v, <2 x i64>* %ptr2, align 4
%ptr3 = bitcast i8* undef to <2 x i64>*
- store <2 x i64> zeroinitializer, <2 x i64>* %ptr3, align 4
+ store <2 x i64> %v, <2 x i64>* %ptr3, align 4
%ptr4 = bitcast i8* undef to <2 x i64>*
- store <2 x i64> zeroinitializer, <2 x i64>* %ptr4, align 4
+ store <2 x i64> %v, <2 x i64>* %ptr4, align 4
br label %loop
}
Modified: llvm/trunk/test/CodeGen/AArch64/ldst-opt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/ldst-opt.ll?rev=286875&r1=286874&r2=286875&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/ldst-opt.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/ldst-opt.ll Mon Nov 14 13:39:04 2016
@@ -1333,3 +1333,134 @@ for.body:
end:
ret void
}
+
+; DAGCombiner::MergeConsecutiveStores merges this into a vector store;
+; replaceZeroVectorStore should then split the vector store back into
+; scalar stores, which should get merged by AArch64LoadStoreOptimizer.
+define void @merge_zr32(i32* %p) {
+; CHECK-LABEL: merge_zr32:
+; CHECK: // %entry
+; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+ store i32 0, i32* %p
+ %p1 = getelementptr i32, i32* %p, i32 1
+ store i32 0, i32* %p1
+ ret void
+}
+
+; Same as merge_zr32, but the merged stores should also get paired.
+define void @merge_zr32_2(i32* %p) {
+; CHECK-LABEL: merge_zr32_2:
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+ store i32 0, i32* %p
+ %p1 = getelementptr i32, i32* %p, i32 1
+ store i32 0, i32* %p1
+ %p2 = getelementptr i32, i32* %p, i64 2
+ store i32 0, i32* %p2
+ %p3 = getelementptr i32, i32* %p, i64 3
+ store i32 0, i32* %p3
+ ret void
+}
+
+; Like merge_zr32_2, but checking the largest allowed stp immediate offset.
+define void @merge_zr32_2_offset(i32* %p) {
+; CHECK-LABEL: merge_zr32_2_offset:
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #504]
+; CHECK-NEXT: ret
+entry:
+ %p0 = getelementptr i32, i32* %p, i32 126
+ store i32 0, i32* %p0
+ %p1 = getelementptr i32, i32* %p, i32 127
+ store i32 0, i32* %p1
+ %p2 = getelementptr i32, i32* %p, i64 128
+ store i32 0, i32* %p2
+ %p3 = getelementptr i32, i32* %p, i64 129
+ store i32 0, i32* %p3
+ ret void
+}
+
+; Like merge_zr32, but replaceZeroVectorStore should not split this
+; vector store since the address offset is too large for the stp
+; instruction.
+define void @no_merge_zr32_2_offset(i32* %p) {
+; CHECK-LABEL: no_merge_zr32_2_offset:
+; CHECK: // %entry
+; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; CHECK-NEXT: str q[[REG]], [x{{[0-9]+}}, #4096]
+; CHECK-NEXT: ret
+entry:
+ %p0 = getelementptr i32, i32* %p, i32 1024
+ store i32 0, i32* %p0
+ %p1 = getelementptr i32, i32* %p, i32 1025
+ store i32 0, i32* %p1
+ %p2 = getelementptr i32, i32* %p, i64 1026
+ store i32 0, i32* %p2
+ %p3 = getelementptr i32, i32* %p, i64 1027
+ store i32 0, i32* %p3
+ ret void
+}
+
+; Like merge_zr32, but replaceZeroVectorStore should not split the
+; vector store since the zero constant vector has multiple uses, so we
+; err on the side that allows for stp q instruction generation.
+define void @merge_zr32_3(i32* %p) {
+; CHECK-LABEL: merge_zr32_3:
+; CHECK: // %entry
+; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+ store i32 0, i32* %p
+ %p1 = getelementptr i32, i32* %p, i32 1
+ store i32 0, i32* %p1
+ %p2 = getelementptr i32, i32* %p, i64 2
+ store i32 0, i32* %p2
+ %p3 = getelementptr i32, i32* %p, i64 3
+ store i32 0, i32* %p3
+ %p4 = getelementptr i32, i32* %p, i64 4
+ store i32 0, i32* %p4
+ %p5 = getelementptr i32, i32* %p, i64 5
+ store i32 0, i32* %p5
+ %p6 = getelementptr i32, i32* %p, i64 6
+ store i32 0, i32* %p6
+ %p7 = getelementptr i32, i32* %p, i64 7
+ store i32 0, i32* %p7
+ ret void
+}
+
+; Similar to merge_zr32, but for 64-bit values.
+define void @merge_zr64(i64* %p) {
+; CHECK-LABEL: merge_zr64:
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+ store i64 0, i64* %p
+ %p1 = getelementptr i64, i64* %p, i64 1
+ store i64 0, i64* %p1
+ ret void
+}
+
+; Similar to merge_zr32_3: replaceZeroVectorStore should not split the
+; vector store, since the zero constant vector has multiple uses.
+define void @merge_zr64_2(i64* %p) {
+; CHECK-LABEL: merge_zr64_2:
+; CHECK: // %entry
+; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+ store i64 0, i64* %p
+ %p1 = getelementptr i64, i64* %p, i64 1
+ store i64 0, i64* %p1
+ %p2 = getelementptr i64, i64* %p, i64 2
+ store i64 0, i64* %p2
+ %p3 = getelementptr i64, i64* %p, i64 3
+ store i64 0, i64* %p3
+ ret void
+}
Modified: llvm/trunk/test/CodeGen/AArch64/ldst-paired-aliasing.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/ldst-paired-aliasing.ll?rev=286875&r1=286874&r2=286875&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/ldst-paired-aliasing.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/ldst-paired-aliasing.ll Mon Nov 14 13:39:04 2016
@@ -10,11 +10,11 @@ declare void @llvm.memset.p0i8.i64(i8* n
define i32 @main() local_unnamed_addr #1 {
; Make sure the stores happen in the correct order (the exact instructions could change).
; CHECK-LABEL: main:
+; CHECK: stp xzr, xzr, [sp, #72]
+; CHECK: str w9, [sp, #80]
; CHECK: str q0, [sp, #48]
; CHECK: ldr w8, [sp, #48]
-; CHECK: stur q1, [sp, #72]
; CHECK: str q0, [sp, #64]
-; CHECK: str w9, [sp, #80]
for.body.lr.ph.i.i.i.i.i.i63:
%b1 = alloca [10 x i32], align 16