[llvm] r372717 - [ARM] Split large truncating MVE stores
David Green via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 24 03:10:41 PDT 2019
Author: dmgreen
Date: Tue Sep 24 03:10:41 2019
New Revision: 372717
URL: http://llvm.org/viewvc/llvm-project?rev=372717&view=rev
Log:
[ARM] Split large truncating MVE stores
MVE does not have a simple sign extend instruction that can move elements
across lanes. We currently often end up moving each lane into and out of a GPR,
in order to get elements into the correct places. When we have a store of a
trunc (or an extend of a load), we can instead just split the store/load in two,
using the narrowing/widening load/store instructions from each half of the
vector.
This does that for stores. It happens very early in a store combine, so as to
easily detect the truncates. (It would be possible to do this later, but that
would involve looking through a buildvector of extract elements. Not impossible
but this way seemed simpler).
By enabling store combines we also get a vmovdrr combine for free, helping some
other tests.
Differential Revision: https://reviews.llvm.org/D67828
Modified:
llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
llvm/trunk/test/CodeGen/Thumb2/float-ops.ll
llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll
Modified: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp?rev=372717&r1=372716&r2=372717&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp Tue Sep 24 03:10:41 2019
@@ -901,7 +901,6 @@ ARMTargetLowering::ARMTargetLowering(con
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
- setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::FP_TO_SINT);
setTargetDAGCombine(ISD::FP_TO_UINT);
setTargetDAGCombine(ISD::FDIV);
@@ -922,6 +921,7 @@ ARMTargetLowering::ARMTargetLowering(con
setTargetDAGCombine(ISD::BUILD_VECTOR);
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::STORE);
}
if (!Subtarget->hasFP64()) {
@@ -13120,95 +13120,161 @@ static SDValue PerformLOADCombine(SDNode
return SDValue();
}
+// Optimize trunc store (of multiple scalars) to shuffle and store. First,
+// pack all of the elements in one place. Next, store to memory in fewer
+// chunks.
+SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG) {
+ SDValue StVal = St->getValue();
+ EVT VT = StVal.getValueType();
+ if (!St->isTruncatingStore() || !VT.isVector())
+ return SDValue();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT StVT = St->getMemoryVT();
+ unsigned NumElems = VT.getVectorNumElements();
+ assert(StVT != VT && "Cannot truncate to the same type");
+ unsigned FromEltSz = VT.getScalarSizeInBits();
+ unsigned ToEltSz = StVT.getScalarSizeInBits();
+
+ // From, To sizes and ElemCount must be pow of two
+ if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
+ return SDValue();
+
+ // We are going to use the original vector elt for storing.
+ // Accumulated smaller vector elements must be a multiple of the store size.
+ if (0 != (NumElems * FromEltSz) % ToEltSz)
+ return SDValue();
+
+ unsigned SizeRatio = FromEltSz / ToEltSz;
+ assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle.
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
+ NumElems * SizeRatio);
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ SDLoc DL(St);
+ SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
+ SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i < NumElems; ++i)
+ ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
+ : i * SizeRatio;
+
+ // Can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT))
+ return SDValue();
+
+ SDValue Shuff = DAG.getVectorShuffle(
+ WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
+ // At this point all of the data is stored at the bottom of the
+ // register. We now need to save it to mem.
+
+ // Find the largest store unit
+ MVT StoreType = MVT::i8;
+ for (MVT Tp : MVT::integer_valuetypes()) {
+ if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
+ StoreType = Tp;
+ }
+ // Didn't find a legal store type.
+ if (!TLI.isTypeLegal(StoreType))
+ return SDValue();
+
+ // Bitcast the original vector into a vector of store-size units
+ EVT StoreVecVT =
+ EVT::getVectorVT(*DAG.getContext(), StoreType,
+ VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
+ assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+ SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
+ SmallVector<SDValue, 8> Chains;
+ SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
+ TLI.getPointerTy(DAG.getDataLayout()));
+ SDValue BasePtr = St->getBasePtr();
+
+ // Perform one or more big stores into memory.
+ unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
+ for (unsigned I = 0; I < E; I++) {
+ SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
+ ShuffWide, DAG.getIntPtrConstant(I, DL));
+ SDValue Ch =
+ DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
+ St->getAlignment(), St->getMemOperand()->getFlags());
+ BasePtr =
+ DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
+ Chains.push_back(Ch);
+ }
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+}
+
+// Try taking a single vector store from an truncate (which would otherwise turn
+// into an expensive buildvector) and splitting it into a series of narrowing
+// stores.
+SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG) {
+ if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
+ return SDValue();
+ SDValue Trunc = St->getValue();
+ if (Trunc->getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+ EVT FromVT = Trunc->getOperand(0).getValueType();
+ EVT ToVT = Trunc.getValueType();
+ if (!ToVT.isVector())
+ return SDValue();
+ assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
+ EVT ToEltVT = ToVT.getVectorElementType();
+ EVT FromEltVT = FromVT.getVectorElementType();
+
+ unsigned NumElements = 0;
+ if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8))
+ NumElements = 4;
+ if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
+ NumElements = 8;
+ if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements ||
+ FromVT.getVectorNumElements() % NumElements != 0)
+ return SDValue();
+
+ SDLoc DL(St);
+ // Details about the old store
+ SDValue Ch = St->getChain();
+ SDValue BasePtr = St->getBasePtr();
+ unsigned Alignment = St->getOriginalAlignment();
+ MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = St->getAAInfo();
+
+ EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements);
+ EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements);
+
+ SmallVector<SDValue, 4> Stores;
+ for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
+ unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
+ SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
+
+ SDValue Extract =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
+ DAG.getConstant(i * NumElements, DL, MVT::i32));
+ SDValue Store = DAG.getTruncStore(
+ Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
+ NewToVT, Alignment, MMOFlags, AAInfo);
+ Stores.push_back(Store);
+ }
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+}
+
/// PerformSTORECombine - Target-specific dag combine xforms for
/// ISD::STORE.
static SDValue PerformSTORECombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
StoreSDNode *St = cast<StoreSDNode>(N);
if (St->isVolatile())
return SDValue();
-
- // Optimize trunc store (of multiple scalars) to shuffle and store. First,
- // pack all of the elements in one place. Next, store to memory in fewer
- // chunks.
SDValue StVal = St->getValue();
EVT VT = StVal.getValueType();
- if (St->isTruncatingStore() && VT.isVector()) {
- SelectionDAG &DAG = DCI.DAG;
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT StVT = St->getMemoryVT();
- unsigned NumElems = VT.getVectorNumElements();
- assert(StVT != VT && "Cannot truncate to the same type");
- unsigned FromEltSz = VT.getScalarSizeInBits();
- unsigned ToEltSz = StVT.getScalarSizeInBits();
-
- // From, To sizes and ElemCount must be pow of two
- if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
-
- // We are going to use the original vector elt for storing.
- // Accumulated smaller vector elements must be a multiple of the store size.
- if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
-
- unsigned SizeRatio = FromEltSz / ToEltSz;
- assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
-
- // Create a type on which we perform the shuffle.
- EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
- NumElems*SizeRatio);
- assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
- SDLoc DL(St);
- SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
- SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i < NumElems; ++i)
- ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
- ? (i + 1) * SizeRatio - 1
- : i * SizeRatio;
-
- // Can't shuffle using an illegal type.
- if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
-
- SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
- DAG.getUNDEF(WideVec.getValueType()),
- ShuffleVec);
- // At this point all of the data is stored at the bottom of the
- // register. We now need to save it to mem.
-
- // Find the largest store unit
- MVT StoreType = MVT::i8;
- for (MVT Tp : MVT::integer_valuetypes()) {
- if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
- StoreType = Tp;
- }
- // Didn't find a legal store type.
- if (!TLI.isTypeLegal(StoreType))
- return SDValue();
-
- // Bitcast the original vector into a vector of store-size units
- EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
- StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
- assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
- SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
- SmallVector<SDValue, 8> Chains;
- SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
- TLI.getPointerTy(DAG.getDataLayout()));
- SDValue BasePtr = St->getBasePtr();
+ if (Subtarget->hasNEON())
+ if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
+ return Store;
- // Perform one or more big stores into memory.
- unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
- for (unsigned I = 0; I < E; I++) {
- SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
- StoreType, ShuffWide,
- DAG.getIntPtrConstant(I, DL));
- SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
- St->getPointerInfo(), St->getAlignment(),
- St->getMemOperand()->getFlags());
- BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
- Increment);
- Chains.push_back(Ch);
- }
- return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
- }
+ if (Subtarget->hasMVEIntegerOps())
+ if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
+ return NewToken;
if (!ISD::isNormalStore(St))
return SDValue();
@@ -13260,7 +13326,7 @@ static SDValue PerformSTORECombine(SDNod
}
// If this is a legal vector store, try to combine it into a VST1_UPD.
- if (ISD::isNormalStore(N) && VT.isVector() &&
+ if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
return CombineBaseUpdate(N, DCI);
@@ -14170,7 +14236,7 @@ SDValue ARMTargetLowering::PerformDAGCom
case ARMISD::BFI: return PerformBFICombine(N, DCI);
case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
- case ISD::STORE: return PerformSTORECombine(N, DCI);
+ case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
Modified: llvm/trunk/test/CodeGen/Thumb2/float-ops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/float-ops.ll?rev=372717&r1=372716&r2=372717&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/float-ops.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/float-ops.ll Tue Sep 24 03:10:41 2019
@@ -130,7 +130,7 @@ define void @store_d(double* %a, double
entry:
; CHECK-LABEL: store_d:
; NOREGS: strd r2, r3, [r0]
-; ONLYREGS: vstr d0, [r0]
+; ONLYREGS: strd r2, r3, [r0]
; HARD: vstr d0, [r0]
store double %b, double* %a, align 8
ret void
Modified: llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll?rev=372717&r1=372716&r2=372717&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/mve-masked-store.ll Tue Sep 24 03:10:41 2019
@@ -521,15 +521,11 @@ define arm_aapcs_vfpcc void @masked_v2i6
; CHECK-LE-NEXT: bfi r3, r1, #0, #1
; CHECK-LE-NEXT: and r1, r3, #3
; CHECK-LE-NEXT: lsls r2, r3, #31
-; CHECK-LE-NEXT: ittt ne
-; CHECK-LE-NEXT: vmovne r2, s1
-; CHECK-LE-NEXT: vmovne r3, s0
-; CHECK-LE-NEXT: strdne r3, r2, [r0]
+; CHECK-LE-NEXT: it ne
+; CHECK-LE-NEXT: vstrne d0, [r0]
; CHECK-LE-NEXT: lsls r1, r1, #30
-; CHECK-LE-NEXT: ittt mi
-; CHECK-LE-NEXT: vmovmi r1, s3
-; CHECK-LE-NEXT: vmovmi r2, s2
-; CHECK-LE-NEXT: strdmi r2, r1, [r0, #8]
+; CHECK-LE-NEXT: it mi
+; CHECK-LE-NEXT: vstrmi d1, [r0, #8]
; CHECK-LE-NEXT: add sp, #4
; CHECK-LE-NEXT: bx lr
;
@@ -558,25 +554,11 @@ define arm_aapcs_vfpcc void @masked_v2i6
; CHECK-BE-NEXT: bfi r3, r1, #0, #1
; CHECK-BE-NEXT: and r1, r3, #3
; CHECK-BE-NEXT: lsls r2, r3, #31
-; CHECK-BE-NEXT: bne .LBB19_3
-; CHECK-BE-NEXT: @ %bb.1: @ %else
-; CHECK-BE-NEXT: lsls r1, r1, #30
-; CHECK-BE-NEXT: bmi .LBB19_4
-; CHECK-BE-NEXT: .LBB19_2: @ %else2
-; CHECK-BE-NEXT: add sp, #4
-; CHECK-BE-NEXT: bx lr
-; CHECK-BE-NEXT: .LBB19_3: @ %cond.store
-; CHECK-BE-NEXT: vrev64.32 q1, q0
-; CHECK-BE-NEXT: vmov r2, s5
-; CHECK-BE-NEXT: vmov r3, s4
-; CHECK-BE-NEXT: strd r3, r2, [r0]
+; CHECK-BE-NEXT: it ne
+; CHECK-BE-NEXT: vstrne d0, [r0]
; CHECK-BE-NEXT: lsls r1, r1, #30
-; CHECK-BE-NEXT: bpl .LBB19_2
-; CHECK-BE-NEXT: .LBB19_4: @ %cond.store1
-; CHECK-BE-NEXT: vrev64.32 q1, q0
-; CHECK-BE-NEXT: vmov r1, s7
-; CHECK-BE-NEXT: vmov r2, s6
-; CHECK-BE-NEXT: strd r2, r1, [r0, #8]
+; CHECK-BE-NEXT: it mi
+; CHECK-BE-NEXT: vstrmi d1, [r0, #8]
; CHECK-BE-NEXT: add sp, #4
; CHECK-BE-NEXT: bx lr
entry:
More information about the llvm-commits
mailing list