[llvm] 535ed62 - [AArch64] Add custom store lowering for 256 bit non-temporal stores.

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 21 14:54:10 PST 2020


Author: Florian Hahn
Date: 2020-01-21T14:53:40-08:00
New Revision: 535ed62c5fcb9b8cd1e7abdd342277761aed29b7

URL: https://github.com/llvm/llvm-project/commit/535ed62c5fcb9b8cd1e7abdd342277761aed29b7
DIFF: https://github.com/llvm/llvm-project/commit/535ed62c5fcb9b8cd1e7abdd342277761aed29b7.diff

LOG: [AArch64] Add custom store lowering for 256 bit non-temporal stores.

Currently we fail to lower non-temporal stores of 256+ bit vectors
to STNPQ, because type legalization will split them up into 128 bit
stores, and because there is no un-paired non-temporal store
instruction, creating STNPQ in the Load/Store optimizer would be
quite tricky.

This patch adds custom lowering for 256 bit non-temporal vector stores
to improve the generated code.
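
For illustration, stores like the ones handled here can be written in
C/C++ using Clang's __builtin_nontemporal_store, which emits a store
annotated with !nontemporal metadata like the ones in the tests below.
The type and function names are made up for this sketch:

    // A 256-bit (8 x i32) vector type; vector_size is in bytes.
    typedef int v8i32 __attribute__((vector_size(32)));

    // Clang lowers __builtin_nontemporal_store to an LLVM IR store
    // with !nontemporal metadata, matching the tests in this patch.
    void store_nt(v8i32 *p, v8i32 v) {
      __builtin_nontemporal_store(v, p);
    }

With this patch, compiling such code for AArch64 is expected to produce
a single 'stnp q0, q1, [x0]' instead of splitting the value and using
d-register stnp pairs.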

Reviewers: dmgreen, samparker, t.p.northover, ab

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D72919

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/test/CodeGen/AArch64/nontemporal.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 38756a847e22..0e871c229204 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -525,6 +525,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::LOAD, MVT::i128, Custom);
   setOperationAction(ISD::STORE, MVT::i128, Custom);
 
+  // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
+  // custom lowering, as there are no un-paired non-temporal stores and
+  // legalization will break up 256 bit inputs.
+  setOperationAction(ISD::STORE, MVT::v32i8, Custom);
+  setOperationAction(ISD::STORE, MVT::v16i16, Custom);
+  setOperationAction(ISD::STORE, MVT::v16f16, Custom);
+  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v8f32, Custom);
+  setOperationAction(ISD::STORE, MVT::v4f64, Custom);
+  setOperationAction(ISD::STORE, MVT::v4i64, Custom);
+
   // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
   // This requires the Performance Monitors extension.
   if (Subtarget->hasPerfMon())
@@ -1382,6 +1393,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::SST1_IMM:          return "AArch64ISD::SST1_IMM";
   case AArch64ISD::LDP:               return "AArch64ISD::LDP";
   case AArch64ISD::STP:               return "AArch64ISD::STP";
+  case AArch64ISD::STNP:              return "AArch64ISD::STNP";
   }
   return nullptr;
 }
@@ -3070,6 +3082,30 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
     if (StoreNode->isTruncatingStore()) {
       return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
     }
+    // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
+    // the custom lowering, as there are no un-paired non-temporal stores and
+    // legalization will break up 256 bit inputs.
+    if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
+        MemVT.getVectorElementCount().Min % 2u == 0 &&
+        ((MemVT.getScalarSizeInBits() == 8u ||
+          MemVT.getScalarSizeInBits() == 16u ||
+          MemVT.getScalarSizeInBits() == 32u ||
+          MemVT.getScalarSizeInBits() == 64u))) {
+      SDValue Lo =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
+                      MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
+                      StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
+      SDValue Hi = DAG.getNode(
+          ISD::EXTRACT_SUBVECTOR, Dl,
+          MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
+          StoreNode->getValue(),
+          DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64));
+      SDValue Result = DAG.getMemIntrinsicNode(
+          AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
+          {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
+          StoreNode->getMemoryVT(), StoreNode->getMemOperand());
+      return Result;
+    }
   } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
     assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
     SDValue Lo =

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 2377e0a07d14..5aaeebef3088 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -272,7 +272,8 @@ enum NodeType : unsigned {
   STZ2G,
 
   LDP,
-  STP
+  STP,
+  STNP
 };
 
 } // end namespace AArch64ISD

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 5650d9140821..c2853da050f1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -245,6 +245,7 @@ def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
 
 def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
 
 // Generates the general dynamic sequences, i.e.
 //  adrp  x0, :tlsdesc:var
@@ -544,6 +545,7 @@ def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;
 
 def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
 def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
 
@@ -2734,6 +2736,10 @@ defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">;
 def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
           (STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>;
 
+def : Pat<(AArch64stnp FPR128:$Rt, FPR128:$Rt2, (am_indexed7s128 GPR64sp:$Rn, simm7s16:$offset)),
+          (STNPQi FPR128:$Rt, FPR128:$Rt2, GPR64sp:$Rn, simm7s16:$offset)>;
+
+
 //---
 // (Register offset)
 

diff --git a/llvm/test/CodeGen/AArch64/nontemporal.ll b/llvm/test/CodeGen/AArch64/nontemporal.ll
index d8785f845c29..241879ad5d5d 100644
--- a/llvm/test/CodeGen/AArch64/nontemporal.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal.ll
@@ -2,10 +2,7 @@
 
 define void @test_stnp_v4i64(<4 x i64>* %p, <4 x i64> %v) #0 {
 ; CHECK-LABEL: test_stnp_v4i64:
-; CHECK-NEXT:  mov d[[HI1:[0-9]+]], v1[1]
-; CHECK-NEXT:  mov d[[HI0:[0-9]+]], v0[1]
-; CHECK-NEXT:  stnp d1, d[[HI1]], [x0, #16]
-; CHECK-NEXT:  stnp d0, d[[HI0]], [x0]
+; CHECK-NEXT:  stnp q0, q1, [x0]
 ; CHECK-NEXT:  ret
   store <4 x i64> %v, <4 x i64>* %p, align 1, !nontemporal !0
   ret void
@@ -334,6 +331,149 @@ define void @test_stnp_v4f32_offset_alloca_2(<4 x float> %v) #0 {
   ret void
 }
 
+define void @test_stnp_v32i8(<32 x i8> %v, <32 x i8>* %ptr) {
+; CHECK-LABEL: _test_stnp_v32i8:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:    stnp    q0, q1, [x0]
+; CHECK-NEXT:    ret
+
+entry:
+  store <32 x i8> %v, <32 x i8>* %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v32i16(<32 x i16> %v, <32 x i16>* %ptr) {
+; CHECK-LABEL: _test_stnp_v32i16:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:    stnp    q2, q3, [x0, #32]
+; CHECK-NEXT:    stnp    q0, q1, [x0]
+; CHECK-NEXT:    ret
+
+entry:
+  store <32 x i16> %v, <32 x i16>* %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v32f16(<32 x half> %v, <32 x half>* %ptr) {
+; CHECK-LABEL: _test_stnp_v32f16:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:    stnp    q2, q3, [x0, #32]
+; CHECK-NEXT:    stnp    q0, q1, [x0]
+; CHECK-NEXT:    ret
+
+entry:
+  store <32 x half> %v, <32 x half>* %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v16i32(<16 x i32> %v, <16 x i32>* %ptr) {
+; CHECK-LABEL: _test_stnp_v16i32:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:    stnp    q2, q3, [x0, #32]
+; CHECK-NEXT:    stnp    q0, q1, [x0]
+; CHECK-NEXT:    ret
+
+entry:
+  store <16 x i32> %v, <16 x i32>* %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v16f32(<16 x float> %v, <16 x float>* %ptr) {
+; CHECK-LABEL: _test_stnp_v16f32:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:    stnp    q2, q3, [x0, #32]
+; CHECK-NEXT:    stnp    q0, q1, [x0]
+; CHECK-NEXT:    ret
+
+entry:
+  store <16 x float> %v, <16 x float>* %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v17f32(<17 x float> %v, <17 x float>* %ptr) {
+; CHECK-LABEL: _test_stnp_v17f32:
+; CHECK-NEXT:	.cfi_startproc
+; CHECK-NEXT:	ldr	s16, [sp, #16]
+; CHECK-NEXT:	mov.s	v0[1], v1[0]
+; CHECK-NEXT:	mov.s	v4[1], v5[0]
+; CHECK-NEXT:	ldr	s1, [sp]
+; CHECK-NEXT:	add	x8, sp, #20
+; CHECK-NEXT:	ld1.s	{ v16 }[1], [x8]
+; CHECK-NEXT:	add	x8, sp, #4
+; CHECK-NEXT:	ld1.s	{ v1 }[1], [x8]
+; CHECK-NEXT:	add	x8, sp, #24
+; CHECK-NEXT:	ld1.s	{ v16 }[2], [x8]
+; CHECK-NEXT:	add	x8, sp, #8
+; CHECK-NEXT:	ld1.s	{ v1 }[2], [x8]
+; CHECK-NEXT:	add	x8, sp, #28
+; CHECK-NEXT:	ld1.s	{ v16 }[3], [x8]
+; CHECK-NEXT:	add	x8, sp, #12
+; CHECK-NEXT:	mov.s	v0[2], v2[0]
+; CHECK-NEXT:	ldr	s2, [sp, #32]
+; CHECK-NEXT:	mov.s	v4[2], v6[0]
+; CHECK-NEXT:	mov.s	v0[3], v3[0]
+; CHECK-NEXT:	mov.s	v4[3], v7[0]
+; CHECK-NEXT:	mov	d3, v4[1]
+; CHECK-NEXT:	mov	d5, v0[1]
+; CHECK-NEXT:	ld1.s	{ v1 }[3], [x8]
+; CHECK-NEXT:	stnp	d4, d3, [x0, #16]
+; CHECK-NEXT:	stnp	d0, d5, [x0]
+; CHECK-NEXT:	mov	d0, v16[1]
+; CHECK-NEXT:	mov	d3, v1[1]
+; CHECK-NEXT:	stnp	d16, d0, [x0, #48]
+; CHECK-NEXT:	stnp	d1, d3, [x0, #32]
+; CHECK-NEXT:	str	s2, [x0, #64]
+; CHECK-NEXT:	ret
+
+entry:
+  store <17 x float> %v, <17 x float>* %ptr, align 4, !nontemporal !0
+  ret void
+}
+define void @test_stnp_v16i32_invalid_offset(<16 x i32> %v, <16 x i32>* %ptr) {
+; CHECK-LABEL: _test_stnp_v16i32_invalid_offset:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:    mov w8, #32000
+; CHECK-NEXT:    mov w9, #32032
+; CHECK-NEXT:    add x8, x0, x8
+; CHECK-NEXT:    add x9, x0, x9
+; CHECK-NEXT:    stnp    q2, q3, [x9]
+; CHECK-NEXT:    stnp    q0, q1, [x8]
+; CHECK-NEXT:    ret
+
+entry:
+  %gep = getelementptr <16 x i32>, <16 x i32>* %ptr, i32 500
+  store <16 x i32> %v, <16 x i32>* %gep, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v16f64(<16 x double> %v, <16 x double>* %ptr) {
+; CHECK-LABEL: _test_stnp_v16f64:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:    stnp    q6, q7, [x0, #96]
+; CHECK-NEXT:    stnp    q4, q5, [x0, #64]
+; CHECK-NEXT:    stnp    q2, q3, [x0, #32]
+; CHECK-NEXT:    stnp    q0, q1, [x0]
+; CHECK-NEXT:    ret
+
+entry:
+  store <16 x double> %v, <16 x double>* %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v16i64(<16 x i64> %v, <16 x i64>* %ptr) {
+; CHECK-LABEL: _test_stnp_v16i64:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:    stnp    q6, q7, [x0, #96]
+; CHECK-NEXT:    stnp    q4, q5, [x0, #64]
+; CHECK-NEXT:    stnp    q2, q3, [x0, #32]
+; CHECK-NEXT:    stnp    q0, q1, [x0]
+; CHECK-NEXT:    ret
+
+entry:
+  store <16 x i64> %v, <16 x i64>* %ptr, align 4, !nontemporal !0
+  ret void
+}
+
 !0 = !{ i32 1 }
 
 attributes #0 = { nounwind }
