[llvm] r286875 - [AArch64] Split 0 vector stores into scalar store pairs.
Geoff Berry via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 14 11:39:05 PST 2016
Author: gberry
Date: Mon Nov 14 13:39:04 2016
New Revision: 286875
URL: http://llvm.org/viewvc/llvm-project?rev=286875&view=rev
Log:
[AArch64] Split 0 vector stores into scalar store pairs.
Summary:
Replace a store of a zero splat vector with scalar stores of WZR/XZR. The
load/store optimizer pass will then merge them into store pairs. This should
be better than using a movi to create the vector zero followed by a vector
store when the zero constant is not reused, since one instruction and one
register live range are removed.
For example, the final generated code should be:
stp xzr, xzr, [x0]
instead of:
movi v0.2d, #0
str q0, [x0]
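For instance, two consecutive scalar zero stores such as the following (this
mirrors the merge_zr64 test added below) are first merged into a vector store
by DAGCombiner::MergeConsecutiveStores, then split back into scalar stores by
this change and finally paired into the stp above:

  define void @merge_zr64(i64* %p) {
  entry:
    store i64 0, i64* %p
    %p1 = getelementptr i64, i64* %p, i64 1
    store i64 0, i64* %p1
    ret void
  }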
Reviewers: t.p.northover, mcrosier, MatzeB, jmolloy
Subscribers: aemerson, rengolin, llvm-commits
Differential Revision: https://reviews.llvm.org/D26561
Modified:
llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/trunk/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
llvm/trunk/test/CodeGen/AArch64/ldst-opt.ll
llvm/trunk/test/CodeGen/AArch64/ldst-paired-aliasing.ll
Modified: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp?rev=286875&r1=286874&r2=286875&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp Mon Nov 14 13:39:04 2016
@@ -8799,6 +8799,61 @@ static SDValue split16BStoreSplat(Select
return NewST1;
}
+/// Replace a store of a zero splat vector with scalar stores of WZR/XZR. The
+/// load/store optimizer pass will then merge them into store pairs. This
+/// should be better than a movi to create the vector zero followed by a
+/// vector store when the zero constant is not reused, since one instruction
+/// and one register live range are removed.
+///
+/// For example, the final generated code should be:
+///
+/// stp xzr, xzr, [x0]
+///
+/// instead of:
+///
+/// movi v0.2d, #0
+/// str q0, [x0]
+///
+static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
+ SDValue StVal = St->getValue();
+ EVT VT = StVal.getValueType();
+
+ // We can express a splat as store pair(s) for 2 or 4 elements.
+ int NumVecElts = VT.getVectorNumElements();
+ if (NumVecElts != 4 && NumVecElts != 2)
+ return SDValue();
+
+ if (StVal.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ // If the zero constant has more than one use then the vector store could be
+ // better, since the constant mov will be amortized and stp q instructions
+ // can be formed.
+ if (!StVal.hasOneUse())
+ return SDValue();
+
+ // If the immediate offset of the address operand is too large for the stp
+ // instruction, then bail out.
+ if (DAG.isBaseWithConstantOffset(St->getBasePtr())) {
+ int64_t Offset = St->getBasePtr()->getConstantOperandVal(1);
+ if (Offset < -512 || Offset > 504)
+ return SDValue();
+ }
+
+ for (int I = 0; I < NumVecElts; ++I) {
+ SDValue EltVal = StVal.getOperand(I);
+ if (!isa<ConstantSDNode>(EltVal) ||
+ !cast<ConstantSDNode>(EltVal)->isNullValue())
+ return SDValue();
+ }
+ // Use WZR/XZR here to prevent DAGCombiner::MergeConsecutiveStores from
+ // undoing this transformation.
+ return split16BStoreSplat(
+ DAG, St, NumVecElts == 4 ? DAG.getRegister(AArch64::WZR, MVT::i32)
+ : DAG.getRegister(AArch64::XZR, MVT::i64),
+ NumVecElts);
+}
+
/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
/// value. The load store optimizer pass will merge them to store pair stores.
/// This has better performance than a splat of the scalar followed by a split
@@ -8862,6 +8917,17 @@ static SDValue split16BStores(SDNode *N,
if (S->isVolatile())
return SDValue();
+ SDValue StVal = S->getValue();
+ EVT VT = StVal.getValueType();
+ if (!VT.isVector())
+ return SDValue();
+
+ // If we get a splat of zeros, convert this vector store to a store of
+ // scalars. They will be merged into store pairs of xzr, thereby removing one
+ // instruction and one register.
+ if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, S))
+ return ReplacedZeroSplat;
+
// FIXME: The logic for deciding if an unaligned store should be split should
// be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
// a call to that function here.
@@ -8873,12 +8939,9 @@ static SDValue split16BStores(SDNode *N,
if (DAG.getMachineFunction().getFunction()->optForMinSize())
return SDValue();
- SDValue StVal = S->getValue();
- EVT VT = StVal.getValueType();
-
// Don't split v2i64 vectors. Memcpy lowering produces those and splitting
// those up regresses performance on micro-benchmarks and olden/bh.
- if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
+ if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
return SDValue();
// Split unaligned 16B stores. They are terrible for performance.
Modified: llvm/trunk/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-misched-basic-A53.ll?rev=286875&r1=286874&r2=286875&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-misched-basic-A53.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-misched-basic-A53.ll Mon Nov 14 13:39:04 2016
@@ -182,22 +182,22 @@ declare void @llvm.trap()
; CHECK: LD4Fourv2d
; CHECK: STRQui
; CHECK: ********** INTERVALS **********
-define void @testLdStConflict() {
+define void @testLdStConflict(<2 x i64> %v) {
entry:
br label %loop
loop:
%0 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i8(i8* null)
%ptr = bitcast i8* undef to <2 x i64>*
- store <2 x i64> zeroinitializer, <2 x i64>* %ptr, align 4
+ store <2 x i64> %v, <2 x i64>* %ptr, align 4
%ptr1 = bitcast i8* undef to <2 x i64>*
- store <2 x i64> zeroinitializer, <2 x i64>* %ptr1, align 4
+ store <2 x i64> %v, <2 x i64>* %ptr1, align 4
%ptr2 = bitcast i8* undef to <2 x i64>*
- store <2 x i64> zeroinitializer, <2 x i64>* %ptr2, align 4
+ store <2 x i64> %v, <2 x i64>* %ptr2, align 4
%ptr3 = bitcast i8* undef to <2 x i64>*
- store <2 x i64> zeroinitializer, <2 x i64>* %ptr3, align 4
+ store <2 x i64> %v, <2 x i64>* %ptr3, align 4
%ptr4 = bitcast i8* undef to <2 x i64>*
- store <2 x i64> zeroinitializer, <2 x i64>* %ptr4, align 4
+ store <2 x i64> %v, <2 x i64>* %ptr4, align 4
br label %loop
}
Modified: llvm/trunk/test/CodeGen/AArch64/ldst-opt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/ldst-opt.ll?rev=286875&r1=286874&r2=286875&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/ldst-opt.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/ldst-opt.ll Mon Nov 14 13:39:04 2016
@@ -1333,3 +1333,134 @@ for.body:
end:
ret void
}
+
+; DAGCombiner::MergeConsecutiveStores merges this into a vector store;
+; replaceZeroVectorStore should then split the vector store back into
+; scalar stores, which should get merged by AArch64LoadStoreOptimizer.
+define void @merge_zr32(i32* %p) {
+; CHECK-LABEL: merge_zr32:
+; CHECK: // %entry
+; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+ store i32 0, i32* %p
+ %p1 = getelementptr i32, i32* %p, i32 1
+ store i32 0, i32* %p1
+ ret void
+}
+
+; Same as merge_zr32, but the merged stores should also get paired.
+define void @merge_zr32_2(i32* %p) {
+; CHECK-LABEL: merge_zr32_2:
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+ store i32 0, i32* %p
+ %p1 = getelementptr i32, i32* %p, i32 1
+ store i32 0, i32* %p1
+ %p2 = getelementptr i32, i32* %p, i64 2
+ store i32 0, i32* %p2
+ %p3 = getelementptr i32, i32* %p, i64 3
+ store i32 0, i32* %p3
+ ret void
+}
+
+; Like merge_zr32_2, but checking the largest allowed stp immediate offset.
+define void @merge_zr32_2_offset(i32* %p) {
+; CHECK-LABEL: merge_zr32_2_offset:
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #504]
+; CHECK-NEXT: ret
+entry:
+ %p0 = getelementptr i32, i32* %p, i32 126
+ store i32 0, i32* %p0
+ %p1 = getelementptr i32, i32* %p, i32 127
+ store i32 0, i32* %p1
+ %p2 = getelementptr i32, i32* %p, i64 128
+ store i32 0, i32* %p2
+ %p3 = getelementptr i32, i32* %p, i64 129
+ store i32 0, i32* %p3
+ ret void
+}
+
+; Like merge_zr32, but replaceZeroVectorStore should not split this
+; vector store since the address offset is too large for the stp
+; instruction.
+define void @no_merge_zr32_2_offset(i32* %p) {
+; CHECK-LABEL: no_merge_zr32_2_offset:
+; CHECK: // %entry
+; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; CHECK-NEXT: str q[[REG]], [x{{[0-9]+}}, #4096]
+; CHECK-NEXT: ret
+entry:
+ %p0 = getelementptr i32, i32* %p, i32 1024
+ store i32 0, i32* %p0
+ %p1 = getelementptr i32, i32* %p, i32 1025
+ store i32 0, i32* %p1
+ %p2 = getelementptr i32, i32* %p, i64 1026
+ store i32 0, i32* %p2
+ %p3 = getelementptr i32, i32* %p, i64 1027
+ store i32 0, i32* %p3
+ ret void
+}
+
+; Like merge_zr32, but replaceZeroVectorStore should not split the
+; vector store since the zero constant vector has multiple uses, so we
+; err on the side that allows for stp q instruction generation.
+define void @merge_zr32_3(i32* %p) {
+; CHECK-LABEL: merge_zr32_3:
+; CHECK: // %entry
+; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+ store i32 0, i32* %p
+ %p1 = getelementptr i32, i32* %p, i32 1
+ store i32 0, i32* %p1
+ %p2 = getelementptr i32, i32* %p, i64 2
+ store i32 0, i32* %p2
+ %p3 = getelementptr i32, i32* %p, i64 3
+ store i32 0, i32* %p3
+ %p4 = getelementptr i32, i32* %p, i64 4
+ store i32 0, i32* %p4
+ %p5 = getelementptr i32, i32* %p, i64 5
+ store i32 0, i32* %p5
+ %p6 = getelementptr i32, i32* %p, i64 6
+ store i32 0, i32* %p6
+ %p7 = getelementptr i32, i32* %p, i64 7
+ store i32 0, i32* %p7
+ ret void
+}
+
+; Similar to merge_zr32, but for 64-bit values.
+define void @merge_zr64(i64* %p) {
+; CHECK-LABEL: merge_zr64:
+; CHECK: // %entry
+; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+ store i64 0, i64* %p
+ %p1 = getelementptr i64, i64* %p, i64 1
+ store i64 0, i64* %p1
+ ret void
+}
+
+; Similar to merge_zr32_3: replaceZeroVectorStore should not split the
+; vector store, since the zero constant vector has multiple uses.
+define void @merge_zr64_2(i64* %p) {
+; CHECK-LABEL: merge_zr64_2:
+; CHECK: // %entry
+; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+entry:
+ store i64 0, i64* %p
+ %p1 = getelementptr i64, i64* %p, i64 1
+ store i64 0, i64* %p1
+ %p2 = getelementptr i64, i64* %p, i64 2
+ store i64 0, i64* %p2
+ %p3 = getelementptr i64, i64* %p, i64 3
+ store i64 0, i64* %p3
+ ret void
+}
Modified: llvm/trunk/test/CodeGen/AArch64/ldst-paired-aliasing.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/ldst-paired-aliasing.ll?rev=286875&r1=286874&r2=286875&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/ldst-paired-aliasing.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/ldst-paired-aliasing.ll Mon Nov 14 13:39:04 2016
@@ -10,11 +10,11 @@ declare void @llvm.memset.p0i8.i64(i8* n
define i32 @main() local_unnamed_addr #1 {
; Make sure the stores happen in the correct order (the exact instructions could change).
; CHECK-LABEL: main:
+; CHECK: stp xzr, xzr, [sp, #72]
+; CHECK: str w9, [sp, #80]
; CHECK: str q0, [sp, #48]
; CHECK: ldr w8, [sp, #48]
-; CHECK: stur q1, [sp, #72]
; CHECK: str q0, [sp, #64]
-; CHECK: str w9, [sp, #80]
for.body.lr.ph.i.i.i.i.i.i63:
%b1 = alloca [10 x i32], align 16