[llvm-commits] [llvm] r154340 - in /llvm/trunk: lib/Target/ARM/ARMISelLowering.cpp test/CodeGen/ARM/2012-03-13-DAGCombineBug.ll test/CodeGen/ARM/opt-shuff-tstore.ll test/CodeGen/ARM/vrev.ll

Jim Grosbach grosbach at apple.com
Mon Apr 9 14:33:02 PDT 2012


On Apr 9, 2012, at 1:32 PM, Chad Rosier <mcrosier at apple.com> wrote:

> Author: mcrosier
> Date: Mon Apr  9 15:32:02 2012
> New Revision: 154340
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=154340&view=rev
> Log:
> When performing a truncating store, it's possible to rearrange the data 
> in-register, such that we can use a single vector store rather than a 
> series of scalar stores.
> 
> For func_4_8 the generated code
> 
> 	vldr	d16, LCPI0_0
> 	vmov	d17, r0, r1
> 	vadd.i16	d16, d17, d16
> 	vmov.u16	r0, d16[3]
> 	strb	r0, [r2, #3]
> 	vmov.u16	r0, d16[2]
> 	strb	r0, [r2, #2]
> 	vmov.u16	r0, d16[1]
> 	strb	r0, [r2, #1]
> 	vmov.u16	r0, d16[0]
> 	strb	r0, [r2]
> 	bx	lr
> 
> becomes
> 
> 	vldr	d16, LCPI0_0
> 	vmov	d17, r0, r1
> 	vadd.i16	d16, d17, d16
> 	vuzp.8	d16, d17
> 	vst1.32	{d16[0]}, [r2, :32]
> 	bx	lr
> 
> I'm not fond of how this combine pessimizes 2012-03-13-DAGCombineBug.ll,
> but I couldn't think of a way to judiciously apply this combine.
> 
> This
> 
> 	ldrh	r0, [r0, #4]
> 	strh	r0, [r1]
> 
> becomes
> 
> 	vldr	d16, [r0]
> 	vmov.u16	r0, d16[2]
> 	vmov.32	d16[0], r0
> 	vuzp.16	d16, d17
> 	vst1.32	{d16[0]}, [r1, :32]

This worries me. We're now touching more memory than we were before. Can we perhaps use that sort of information to fine-tune when to fire the combine?

> 
> PR11158
> rdar://10703339
> 
> Added:
>    llvm/trunk/test/CodeGen/ARM/opt-shuff-tstore.ll
> Modified:
>    llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
>    llvm/trunk/test/CodeGen/ARM/2012-03-13-DAGCombineBug.ll
>    llvm/trunk/test/CodeGen/ARM/vrev.ll
> 
> Modified: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp?rev=154340&r1=154339&r2=154340&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp Mon Apr  9 15:32:02 2012
> @@ -7339,8 +7339,92 @@
> static SDValue PerformSTORECombine(SDNode *N,
>                                    TargetLowering::DAGCombinerInfo &DCI) {
>   StoreSDNode *St = cast<StoreSDNode>(N);
> +  if (St->isVolatile())
> +    return SDValue();
> +
> +  // Optimize trunc store (of multiple scalars) to shuffle and store.  First, 
> +  // pack all of the elements in one place.  Next, store to memory in fewer
> +  // chunks.
>   SDValue StVal = St->getValue();
> -  if (!ISD::isNormalStore(St) || St->isVolatile())
> +  EVT VT = StVal.getValueType();
> +  if (St->isTruncatingStore() && VT.isVector()) {
> +    SelectionDAG &DAG = DCI.DAG;
> +    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
> +    EVT StVT = St->getMemoryVT();
> +    unsigned NumElems = VT.getVectorNumElements();
> +    assert(StVT != VT && "Cannot truncate to the same type");
> +    unsigned FromEltSz = VT.getVectorElementType().getSizeInBits();
> +    unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits();
> +
> +    // From, To sizes and ElemCount must be pow of two
> +    if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
> +
> +    // We are going to use the original vector elt for storing.
> +    // Accumulated smaller vector elements must be a multiple of the store size.
> +    if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
> +
> +    unsigned SizeRatio  = FromEltSz / ToEltSz;
> +    assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
> +
> +    // Create a type on which we perform the shuffle.
> +    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
> +                                     NumElems*SizeRatio);
> +    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
> +
> +    DebugLoc DL = St->getDebugLoc();
> +    SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
> +    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
> +    for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = i * SizeRatio;
> +
> +    // Can't shuffle using an illegal type.
> +    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
> +
> +    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
> +                                DAG.getUNDEF(WideVec.getValueType()),
> +                                ShuffleVec.data());
> +    // At this point all of the data is stored at the bottom of the
> +    // register. We now need to save it to mem.
> +
> +    // Find the largest store unit
> +    MVT StoreType = MVT::i8;
> +    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
> +         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
> +      MVT Tp = (MVT::SimpleValueType)tp;
> +      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
> +        StoreType = Tp;
> +    }
> +    // Didn't find a legal store type.
> +    if (!TLI.isTypeLegal(StoreType))
> +      return SDValue();
> +
> +    // Bitcast the original vector into a vector of store-size units
> +    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
> +            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
> +    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
> +    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
> +    SmallVector<SDValue, 8> Chains;
> +    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
> +                                        TLI.getPointerTy());
> +    SDValue BasePtr = St->getBasePtr();
> +
> +    // Perform one or more big stores into memory.
> +    unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
> +    for (unsigned I = 0; I < E; I++) {
> +      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
> +                                   StoreType, ShuffWide,
> +                                   DAG.getIntPtrConstant(I));
> +      SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
> +                                St->getPointerInfo(), St->isVolatile(),
> +                                St->isNonTemporal(), St->getAlignment());
> +      BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
> +                            Increment);
> +      Chains.push_back(Ch);
> +    }
> +    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0],
> +                       Chains.size());
> +  }
> +
> +  if (!ISD::isNormalStore(St))
>     return SDValue();
> 
>   // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
> 
> Modified: llvm/trunk/test/CodeGen/ARM/2012-03-13-DAGCombineBug.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/2012-03-13-DAGCombineBug.ll?rev=154340&r1=154339&r2=154340&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/2012-03-13-DAGCombineBug.ll (original)
> +++ llvm/trunk/test/CodeGen/ARM/2012-03-13-DAGCombineBug.ll Mon Apr  9 15:32:02 2012
> @@ -6,8 +6,7 @@
> ; (i32 extload $addr+c*sizeof(i16)
> define void @test_hi_short3(<3 x i16> * nocapture %srcA, <2 x i16> * nocapture %dst) nounwind {
> entry:
> -; CHECK: ldrh [[REG:r[0-9]+]]
> -; CHECK: strh [[REG]]
> +; CHECK: vst1.32
>   %0 = load <3 x i16> * %srcA, align 8
>   %1 = shufflevector <3 x i16> %0, <3 x i16> undef, <2 x i32> <i32 2, i32 undef>
>   store <2 x i16> %1, <2 x i16> * %dst, align 4
> 
> Added: llvm/trunk/test/CodeGen/ARM/opt-shuff-tstore.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/opt-shuff-tstore.ll?rev=154340&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/opt-shuff-tstore.ll (added)
> +++ llvm/trunk/test/CodeGen/ARM/opt-shuff-tstore.ll Mon Apr  9 15:32:02 2012
> @@ -0,0 +1,19 @@
> +; RUN: llc -mcpu=cortex-a9 -mtriple=arm-linux-unknown -promote-elements -mattr=+neon < %s | FileCheck %s
> +
> +; CHECK: func_4_8
> +; CHECK: vst1.32
> +; CHECK-NEXT: bx lr
> +define void @func_4_8(<4 x i8> %param, <4 x i8>* %p) {
> +  %r = add <4 x i8> %param, <i8 1, i8 2, i8 3, i8 4>
> +  store <4 x i8> %r, <4 x i8>* %p
> +  ret void
> +}
> +
> +; CHECK: func_2_16
> +; CHECK: vst1.32
> +; CHECK-NEXT: bx lr
> +define void @func_2_16(<2 x i16> %param, <2 x i16>* %p) {
> +  %r = add <2 x i16> %param, <i16 1, i16 2>
> +  store <2 x i16> %r, <2 x i16>* %p
> +  ret void
> +}
> 
> Modified: llvm/trunk/test/CodeGen/ARM/vrev.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/vrev.ll?rev=154340&r1=154339&r2=154340&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/ARM/vrev.ll (original)
> +++ llvm/trunk/test/CodeGen/ARM/vrev.ll Mon Apr  9 15:32:02 2012
> @@ -149,12 +149,10 @@
> }
> 
> ; The type <2 x i16> is legalized to <2 x i32> and need to be trunc-stored
> -; to <2 x i16> when stored to memory. Currently ARM scalarizes these stores.
> -; See PR 11158
> +; to <2 x i16> when stored to memory.
> define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
> ; CHECK: test_vrev64:
> -; CHECK: vst1.16
> -; CHECK: vst1.16
> +; CHECK: vst1.32
> entry:
>   %0 = bitcast <4 x i16>* %source to <8 x i16>*
>   %tmp2 = load <8 x i16>* %0, align 4
> 
> 
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits




More information about the llvm-commits mailing list