[llvm] 535ed62 - [AArch64] Add custom store lowering for 256 bit non-temporal stores.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 21 14:54:10 PST 2020
Author: Florian Hahn
Date: 2020-01-21T14:53:40-08:00
New Revision: 535ed62c5fcb9b8cd1e7abdd342277761aed29b7
URL: https://github.com/llvm/llvm-project/commit/535ed62c5fcb9b8cd1e7abdd342277761aed29b7
DIFF: https://github.com/llvm/llvm-project/commit/535ed62c5fcb9b8cd1e7abdd342277761aed29b7.diff
LOG: [AArch64] Add custom store lowering for 256 bit non-temporal stores.
Currently we fail to lower non-temporal stores of 256-bit (and wider) vectors
to STNPQ, because type legalization splits them into 128-bit stores, and,
since there is no un-paired non-temporal store instruction, creating STNPQ
in the Load/Store optimizer would be quite tricky.

This patch adds custom lowering for 256-bit non-temporal vector stores
to improve the generated code.
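For illustration, consider the test_stnp_v4i64 case from the updated
llvm/test/CodeGen/AArch64/nontemporal.ll below. The input IR is a plain
256-bit store tagged with !nontemporal metadata:

    define void @test_stnp_v4i64(<4 x i64>* %p, <4 x i64> %v) {
      store <4 x i64> %v, <4 x i64>* %p, align 1, !nontemporal !0
      ret void
    }
    !0 = !{ i32 1 }

Before this patch the store was legalized into two 128-bit halves, each of
which was then paired into a 64-bit STNP (register numbers illustrative,
per the old FileCheck patterns):

    mov  d2, v1[1]
    mov  d3, v0[1]
    stnp d1, d2, [x0, #16]
    stnp d0, d3, [x0]
    ret

With the custom lowering, the 256-bit value is split into two q registers
during lowering and emitted as a single STNPQ:

    stnp q0, q1, [x0]
    ret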
Reviewers: dmgreen, samparker, t.p.northover, ab
Reviewed By: dmgreen
Differential Revision: https://reviews.llvm.org/D72919
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/test/CodeGen/AArch64/nontemporal.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 38756a847e22..0e871c229204 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -525,6 +525,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::i128, Custom);
setOperationAction(ISD::STORE, MVT::i128, Custom);
+ // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
+ // custom lowering, as there are no un-paired non-temporal stores and
+ // legalization will break up 256 bit inputs.
+ setOperationAction(ISD::STORE, MVT::v32i8, Custom);
+ setOperationAction(ISD::STORE, MVT::v16i16, Custom);
+ setOperationAction(ISD::STORE, MVT::v16f16, Custom);
+ setOperationAction(ISD::STORE, MVT::v8i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v8f32, Custom);
+ setOperationAction(ISD::STORE, MVT::v4f64, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i64, Custom);
+
// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
// This requires the Performance Monitors extension.
if (Subtarget->hasPerfMon())
@@ -1382,6 +1393,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM";
case AArch64ISD::LDP: return "AArch64ISD::LDP";
case AArch64ISD::STP: return "AArch64ISD::STP";
+ case AArch64ISD::STNP: return "AArch64ISD::STNP";
}
return nullptr;
}
@@ -3070,6 +3082,30 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
if (StoreNode->isTruncatingStore()) {
return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
}
+ // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
+ // the custom lowering, as there are no un-paired non-temporal stores and
+ // legalization will break up 256 bit inputs.
+ if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
+ MemVT.getVectorElementCount().Min % 2u == 0 &&
+ ((MemVT.getScalarSizeInBits() == 8u ||
+ MemVT.getScalarSizeInBits() == 16u ||
+ MemVT.getScalarSizeInBits() == 32u ||
+ MemVT.getScalarSizeInBits() == 64u))) {
+ SDValue Lo =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
+ MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
+ StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
+ SDValue Hi = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, Dl,
+ MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
+ StoreNode->getValue(),
+ DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64));
+ SDValue Result = DAG.getMemIntrinsicNode(
+ AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
+ {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
+ StoreNode->getMemoryVT(), StoreNode->getMemOperand());
+ return Result;
+ }
} else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
SDValue Lo =
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 2377e0a07d14..5aaeebef3088 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -272,7 +272,8 @@ enum NodeType : unsigned {
STZ2G,
LDP,
- STP
+ STP,
+ STNP
};
} // end namespace AArch64ISD
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 5650d9140821..c2853da050f1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -245,6 +245,7 @@ def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
// Generates the general dynamic sequences, i.e.
// adrp x0, :tlsdesc:var
@@ -544,6 +545,7 @@ def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;
def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
@@ -2734,6 +2736,10 @@ defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">;
def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
(STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>;
+def : Pat<(AArch64stnp FPR128:$Rt, FPR128:$Rt2, (am_indexed7s128 GPR64sp:$Rn, simm7s16:$offset)),
+ (STNPQi FPR128:$Rt, FPR128:$Rt2, GPR64sp:$Rn, simm7s16:$offset)>;
+
+
//---
// (Register offset)
diff --git a/llvm/test/CodeGen/AArch64/nontemporal.ll b/llvm/test/CodeGen/AArch64/nontemporal.ll
index d8785f845c29..241879ad5d5d 100644
--- a/llvm/test/CodeGen/AArch64/nontemporal.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal.ll
@@ -2,10 +2,7 @@
define void @test_stnp_v4i64(<4 x i64>* %p, <4 x i64> %v) #0 {
; CHECK-LABEL: test_stnp_v4i64:
-; CHECK-NEXT: mov d[[HI1:[0-9]+]], v1[1]
-; CHECK-NEXT: mov d[[HI0:[0-9]+]], v0[1]
-; CHECK-NEXT: stnp d1, d[[HI1]], [x0, #16]
-; CHECK-NEXT: stnp d0, d[[HI0]], [x0]
+; CHECK-NEXT: stnp q0, q1, [x0]
; CHECK-NEXT: ret
store <4 x i64> %v, <4 x i64>* %p, align 1, !nontemporal !0
ret void
@@ -334,6 +331,149 @@ define void @test_stnp_v4f32_offset_alloca_2(<4 x float> %v) #0 {
ret void
}
+define void @test_stnp_v32i8(<32 x i8> %v, <32 x i8>* %ptr) {
+; CHECK-LABEL: _test_stnp_v32i8:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: stnp q0, q1, [x0]
+; CHECK-NEXT: ret
+
+entry:
+ store <32 x i8> %v, <32 x i8>* %ptr, align 4, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v32i16(<32 x i16> %v, <32 x i16>* %ptr) {
+; CHECK-LABEL: _test_stnp_v32i16:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: stnp q2, q3, [x0, #32]
+; CHECK-NEXT: stnp q0, q1, [x0]
+; CHECK-NEXT: ret
+
+entry:
+ store <32 x i16> %v, <32 x i16>* %ptr, align 4, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v32f16(<32 x half> %v, <32 x half>* %ptr) {
+; CHECK-LABEL: _test_stnp_v32f16:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: stnp q2, q3, [x0, #32]
+; CHECK-NEXT: stnp q0, q1, [x0]
+; CHECK-NEXT: ret
+
+entry:
+ store <32 x half> %v, <32 x half>* %ptr, align 4, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v16i32(<16 x i32> %v, <16 x i32>* %ptr) {
+; CHECK-LABEL: _test_stnp_v16i32:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: stnp q2, q3, [x0, #32]
+; CHECK-NEXT: stnp q0, q1, [x0]
+; CHECK-NEXT: ret
+
+entry:
+ store <16 x i32> %v, <16 x i32>* %ptr, align 4, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v16f32(<16 x float> %v, <16 x float>* %ptr) {
+; CHECK-LABEL: _test_stnp_v16f32:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: stnp q2, q3, [x0, #32]
+; CHECK-NEXT: stnp q0, q1, [x0]
+; CHECK-NEXT: ret
+
+entry:
+ store <16 x float> %v, <16 x float>* %ptr, align 4, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v17f32(<17 x float> %v, <17 x float>* %ptr) {
+; CHECK-LABEL: _test_stnp_v17f32:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: ldr s16, [sp, #16]
+; CHECK-NEXT: mov.s v0[1], v1[0]
+; CHECK-NEXT: mov.s v4[1], v5[0]
+; CHECK-NEXT: ldr s1, [sp]
+; CHECK-NEXT: add x8, sp, #20
+; CHECK-NEXT: ld1.s { v16 }[1], [x8]
+; CHECK-NEXT: add x8, sp, #4
+; CHECK-NEXT: ld1.s { v1 }[1], [x8]
+; CHECK-NEXT: add x8, sp, #24
+; CHECK-NEXT: ld1.s { v16 }[2], [x8]
+; CHECK-NEXT: add x8, sp, #8
+; CHECK-NEXT: ld1.s { v1 }[2], [x8]
+; CHECK-NEXT: add x8, sp, #28
+; CHECK-NEXT: ld1.s { v16 }[3], [x8]
+; CHECK-NEXT: add x8, sp, #12
+; CHECK-NEXT: mov.s v0[2], v2[0]
+; CHECK-NEXT: ldr s2, [sp, #32]
+; CHECK-NEXT: mov.s v4[2], v6[0]
+; CHECK-NEXT: mov.s v0[3], v3[0]
+; CHECK-NEXT: mov.s v4[3], v7[0]
+; CHECK-NEXT: mov d3, v4[1]
+; CHECK-NEXT: mov d5, v0[1]
+; CHECK-NEXT: ld1.s { v1 }[3], [x8]
+; CHECK-NEXT: stnp d4, d3, [x0, #16]
+; CHECK-NEXT: stnp d0, d5, [x0]
+; CHECK-NEXT: mov d0, v16[1]
+; CHECK-NEXT: mov d3, v1[1]
+; CHECK-NEXT: stnp d16, d0, [x0, #48]
+; CHECK-NEXT: stnp d1, d3, [x0, #32]
+; CHECK-NEXT: str s2, [x0, #64]
+; CHECK-NEXT: ret
+
+entry:
+ store <17 x float> %v, <17 x float>* %ptr, align 4, !nontemporal !0
+ ret void
+}
+define void @test_stnp_v16i32_invalid_offset(<16 x i32> %v, <16 x i32>* %ptr) {
+; CHECK-LABEL: _test_stnp_v16i32_invalid_offset:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: mov w8, #32000
+; CHECK-NEXT: mov w9, #32032
+; CHECK-NEXT: add x8, x0, x8
+; CHECK-NEXT: add x9, x0, x9
+; CHECK-NEXT: stnp q2, q3, [x9]
+; CHECK-NEXT: stnp q0, q1, [x8]
+; CHECK-NEXT: ret
+
+entry:
+ %gep = getelementptr <16 x i32>, <16 x i32>* %ptr, i32 500
+ store <16 x i32> %v, <16 x i32>* %gep, align 4, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v16f64(<16 x double> %v, <16 x double>* %ptr) {
+; CHECK-LABEL: _test_stnp_v16f64:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: stnp q6, q7, [x0, #96]
+; CHECK-NEXT: stnp q4, q5, [x0, #64]
+; CHECK-NEXT: stnp q2, q3, [x0, #32]
+; CHECK-NEXT: stnp q0, q1, [x0]
+; CHECK-NEXT: ret
+
+entry:
+ store <16 x double> %v, <16 x double>* %ptr, align 4, !nontemporal !0
+ ret void
+}
+
+define void @test_stnp_v16i64(<16 x i64> %v, <16 x i64>* %ptr) {
+; CHECK-LABEL: _test_stnp_v16i64:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: stnp q6, q7, [x0, #96]
+; CHECK-NEXT: stnp q4, q5, [x0, #64]
+; CHECK-NEXT: stnp q2, q3, [x0, #32]
+; CHECK-NEXT: stnp q0, q1, [x0]
+; CHECK-NEXT: ret
+
+entry:
+ store <16 x i64> %v, <16 x i64>* %ptr, align 4, !nontemporal !0
+ ret void
+}
+
!0 = !{ i32 1 }
attributes #0 = { nounwind }